D45370.diff

diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3438,6 +3438,7 @@
dev/virtio/mmio/virtio_mmio_cmdline.c optional virtio_mmio
dev/virtio/mmio/virtio_mmio_fdt.c optional virtio_mmio fdt
dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio
+dev/virtio/dbg/virtio_dbg.c optional virtio_dbg
dev/virtio/network/if_vtnet.c optional vtnet
dev/virtio/balloon/virtio_balloon.c optional virtio_balloon
dev/virtio/block/virtio_blk.c optional virtio_blk
diff --git a/sys/dev/virtio/dbg/virtio_dbg.h b/sys/dev/virtio/dbg/virtio_dbg.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.h
@@ -0,0 +1,25 @@
+#ifndef _VIRTIO_DBG_
+#define _VIRTIO_DBG_
+
+#include <sys/cdefs.h>
+#include <sys/ioccom.h>
+
+struct vtdbg_transfer {
+ caddr_t vtdt_device;
+ caddr_t vtdt_driver;
+ size_t vtdt_len;
+};
+
+struct vtdbg_io_args {
+ struct vtdbg_transfer *transfers;
+ size_t cnt;
+ bool touser;
+};
+
+#define VIRTIO_DBG_INIT _IO('v', 1)
+#define VIRTIO_DBG_KICK _IO('v', 2)
+#define VIRTIO_DBG_ACK _IO('v', 3)
+#define VIRTIO_DBG_TRANSFER _IOWR('v', 4, struct vtdbg_io_args)
+
+
+#endif /* _VIRTIO_DBG_ */
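
[Editorial sketch, not part of the patch: how a userspace device emulator might drive this ioctl interface. The /dev/vtdbg path and the 10 MB mapping size mirror the cdev name and VTDBG_MAPSZ defined in virtio_dbg.c below; the helper name and its arguments are hypothetical and most error handling is omitted.]

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include "virtio_dbg.h"

/* Hypothetical helper: open an instance, create the transport, copy one buffer out. */
static int
vtdbg_example(void *user_buf, uint64_t desc_paddr, size_t len)
{
	struct vtdbg_transfer xfer;
	struct vtdbg_io_args args;
	void *shared;
	int fd;

	fd = open("/dev/vtdbg", O_RDWR);	/* one emulated transport per open */
	if (fd < 0)
		return (-1);

	/* Map the shared kernel/user device control region (registers + vrings). */
	shared = mmap(NULL, 10 * 1024 * 1024, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (shared == MAP_FAILED)
		return (-1);
	(void)shared;

	/* Create the emulated transport and its virtio child device. */
	if (ioctl(fd, VIRTIO_DBG_INIT) != 0)
		return (-1);

	/* Copy data from a descriptor's physical address out to our buffer. */
	xfer.vtdt_driver = (caddr_t)(uintptr_t)desc_paddr;
	xfer.vtdt_device = user_buf;
	xfer.vtdt_len = len;
	args.transfers = &xfer;
	args.cnt = 1;
	args.touser = true;
	if (ioctl(fd, VIRTIO_DBG_TRANSFER, &args) != 0)
		return (-1);

	/* Tell the kernel to run the device's interrupt handler. */
	ioctl(fd, VIRTIO_DBG_KICK);
	return (fd);
}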
diff --git a/sys/dev/virtio/dbg/virtio_dbg.c b/sys/dev/virtio/dbg/virtio_dbg.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.c
@@ -0,0 +1,970 @@
+/*-
+ * Copyright (c) 2024 Emil Tsalapatis
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/bus.h>
+#include <machine/pmap.h>
+#include <machine/resource.h>
+#include <machine/vmparam.h>
+
+#include <dev/virtio/virtio_config.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/dbg/virtio_dbg.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+#include "virtio_mmio_if.h"
+
+#define VTDBG_MAGIC ((uint64_t)0x84848484ULL)
+
+/*
+ * XXX Determine these sizes in a well-defined
+ * per-device fashion.
+ */
+#define VTDBG_MAPSZ (1024 * 1024 * 10)
+#define VTDBG_RESERVE_DEVSPACE (4096)
+
+/* XXX Remove after development is done. */
+#define VTDBG_WARN(format, ...) \
+ do { \
+ printf("(%s:%d) " format, __func__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+static device_t vtdbg_parent;
+static driver_t *vtdbg_driver;
+
+#define VTDBG_UPDATE_DESC (0x01)
+#define VTDBG_UPDATE_USED (0x02)
+#define VTDBG_UPDATE_AVAIL (0x04)
+#define VTDBG_INTR_PENDING (0x08)
+#define VTDBG_INTR_EXITING (0x10)
+
+/*
+ * Information on a debug device instance. Accessed
+ * through the control device's softc.
+ */
+struct vtdbg_softc {
+ struct mtx vtd_mtx;
+ struct cv vtd_cv;
+ struct knlist vtd_note;
+ uint32_t vtd_magic;
+
+ vm_object_t vtd_object;
+ vm_ooffset_t vtd_baseaddr;
+ size_t vtd_bytes;
+ size_t vtd_allocated;
+
+ virtqueue_intr_t *vtd_intr;
+ void *vtd_intr_arg;
+ struct proc *vtd_pintr;
+
+ vm_ooffset_t vtd_offset;
+
+ uint32_t vtd_flags;
+
+ device_t vtd_dev;
+};
+
+/*
+ * Subclass of vtmmio_softc that lets the virtio device access
+ * vtdbg-related information while still being usable from vtmmio_*
+ * methods. The vtdbg_softc * is the softc of the control device and
+ * is allocated dynamically when opening an instance of the control device,
+ * while the virtio_dbg_softc here is allocated during device_t creation.
+ */
+struct virtio_dbg_softc {
+ struct vtmmio_softc vtmdbg_mmio;
+ struct vtdbg_softc *vtmdbg_dbg;
+};
+
+/*
+ * Store the parent bus and driver pointers for the debug devices,
+ * because we need them when creating debug devices on-demand later on.
+ * We hang off the nexus, so we are certain it is not going away.
+ */
+static void
+virtio_dbg_identify(driver_t *driver, device_t parent)
+{
+ vtdbg_parent = parent;
+ vtdbg_driver = driver;
+}
+
+static struct vtdbg_softc *
+vtmmio_get_vtdbg(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ MPASS(sc->vtmdbg_dbg->vtd_magic == VTDBG_MAGIC);
+
+ return (sc->vtmdbg_dbg);
+}
+
+/*
+ * Explicitly turn polling into a no-op.
+ */
+static int
+virtio_dbg_poll(device_t dev)
+{
+
+ return (0);
+}
+
+
+/*
+ * Make sure the shared virtio device region between kernel and userspace
+ * is configured properly.
+ */
+static int
+virtio_dbg_probe(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ uint32_t magic, version;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /* Fake platform to trigger virtio_mmio_note() on writes. */
+ sc->vtmdbg_mmio.platform = dev;
+
+ magic = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_MAGIC_VALUE);
+ if (magic != VIRTIO_MMIO_MAGIC_VIRT) {
+ device_printf(dev, "Bad magic value %#x\n", magic);
+ return (ENXIO);
+ }
+
+ version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+ if (version != 2) {
+ device_printf(dev, "Unsupported version: %#x\n", version);
+ return (ENXIO);
+ }
+
+ if (vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_DEVICE_ID) == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Emulated MMIO adapter");
+
+ return (0);
+}
+
+/*
+ * Creates the virtio device corresponding to the transport instance.
+ */
+static int
+virtio_dbg_attach(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ device_t child;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ mmiosc->dev = dev;
+ mmiosc->vtmmio_version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+
+ vtmmio_reset(mmiosc);
+
+ /* Tell the host we've noticed this device. */
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+ mtx_lock(&Giant);
+ if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+ device_printf(dev, "Cannot create child device.\n");
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ return (ENOMEM);
+ }
+
+ mmiosc->vtmmio_child_dev = child;
+ vtmmio_probe_and_attach_child(mmiosc);
+
+ mtx_unlock(&Giant);
+
+ return (0);
+}
+
+/*
+ * Recompute the queue descriptor to be an offset within the shared user/kernel
+ * device control region. Our userspace cannot meaningfully translate
+ * kernel physical addresses, so we transform the values in the queue
+ * descriptor address registers into offsets. Userspace finds the vq address
+ * by adding the offset to its own virtual address for the region.
+ */
+static void
+virtio_dbg_qdesc_offset(struct vtmmio_softc *sc, uint64_t baseaddr,
+ int hireg, int loreg)
+{
+ struct resource *res = sc->res[0];
+ uint32_t hi, lo;
+ uint64_t qaddr;
+
+ /* Read in the components of the physical address. */
+ hi = bus_read_4(res, hireg);
+ lo = bus_read_4(res, loreg);
+
+ /* Recompute into an offset into the vq control region. */
+ qaddr = (((uint64_t)hi) << 32 | (uint64_t)lo);
+ qaddr -= vtophys(baseaddr);
+
+ /* Update the register values. */
+ hi = (qaddr >> 32);
+ lo = (qaddr & ((1ULL << 32) - 1));
+
+ /* Use a direct bus write to avoid triggering note(). */
+ bus_write_4(res, hireg, hi);
+ bus_write_4(res, loreg, lo);
+}
+
+/* Notify userspace of a write, and wait for a response. */
+static int
+virtio_dbg_note(device_t dev, size_t offset, int val)
+{
+ struct vtdbg_softc *vtdsc;
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ vtdsc = sc->vtmdbg_dbg;
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ /*
+ * Intercept writes to the QUEUE_{DESC, AVAIL, USED}_{HIGH, LOW}
+ * registers and instead pass to the user the offset from the beginning
+ * of the control region. Do not notify userspace of these writes yet;
+ * we recompute the offsets and notify once VIRTIO_MMIO_QUEUE_READY is set.
+ *
+ * Both high and low registers are set together, so just track writes to
+ * the high address bits.
+ */
+ switch (offset) {
+ case VIRTIO_MMIO_QUEUE_DESC_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_DESC;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_USED_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_USED;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_AVAIL;
+ return (1);
+ }
+
+ /* Only forward the listed register writes to userspace. */
+ switch (offset) {
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ case VIRTIO_MMIO_QUEUE_SEL:
+ case VIRTIO_MMIO_QUEUE_NUM:
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ case VIRTIO_MMIO_STATUS:
+ break;
+ case VIRTIO_MMIO_QUEUE_READY:
+ /* If changed, transform the offsets. */
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_DESC) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_DESC_HIGH, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_DESC;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_USED) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_USED_HIGH, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_USED;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_AVAIL) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_AVAIL_HIGH, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_AVAIL;
+ }
+ break;
+ default:
+ return (1);
+ }
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ vtdsc->vtd_offset = offset;
+ KNOTE_LOCKED(&vtdsc->vtd_note, 0);
+
+ /*
+ * We cannot sleep here because this code is called with non-sleepable locks
+ * held; the corresponding operation for other transports is a VM exit, which
+ * is instantaneous from the guest kernel's point of view. To prevent a
+ * "sleeping thread" panic, we busy wait instead. There is always the danger
+ * of the VMM process leaving us hanging, but that danger also exists for
+ * non-emulated virtio transports - it just isn't visible to the guest, since
+ * the VMM normally runs on the host.
+ */
+ while (vtdsc->vtd_offset != 0) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ cpu_spinwait();
+ mtx_lock(&vtdsc->vtd_mtx);
+ }
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (1);
+}
+
+/*
+ * Pass interrupt information to the cdev. The cdev will be directly
+ * running the device interrupt handling code as an ioctl.
+ */
+static int
+virtio_dbg_setup_intr(device_t dev, device_t mmio_dev, void *handler, void *ih_user)
+{
+ struct vtdbg_softc *sc;
+
+ sc = vtmmio_get_vtdbg(dev);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_intr = handler;
+ sc->vtd_intr_arg = ih_user;
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (0);
+}
+
+static device_method_t virtio_dbg_methods[] = {
+ DEVMETHOD(device_attach, virtio_dbg_attach),
+ DEVMETHOD(device_identify, virtio_dbg_identify),
+ DEVMETHOD(device_probe, virtio_dbg_probe),
+
+ DEVMETHOD(virtio_mmio_poll, virtio_dbg_poll),
+ DEVMETHOD(virtio_mmio_note, virtio_dbg_note),
+ DEVMETHOD(virtio_mmio_setup_intr, virtio_dbg_setup_intr),
+
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_1(virtio_dbg, virtio_dbg_driver, virtio_dbg_methods,
+ sizeof(struct vtdbg_softc), vtmmio_driver);
+/*
+ * XXX We currently hang off the nexus; not 100% sure this is the right way.
+ */
+DRIVER_MODULE(virtio_dbg, nexus, virtio_dbg_driver, 0, 0);
+MODULE_VERSION(virtio_dbg, 1);
+
+static struct cdev *vtdbg_dev;
+
+/*
+ * Create and map the device memory into the kernel.
+ */
+static int
+vtdbg_map_kernel(struct vtdbg_softc *sc)
+{
+ vm_object_t obj = sc->vtd_object;
+ size_t bytes = IDX_TO_OFF(obj->size);
+ vm_offset_t baseaddr, tmp;
+ vm_page_t m, end_m;
+ int error;
+
+ /* XXX Do not allow mapping twice. */
+
+ vm_object_reference(obj);
+
+ /*
+ * Populate the object with physically contiguous pages, because
+ * the object is used to back the virtio device control region.
+ */
+ VM_OBJECT_WLOCK(obj);
+ m = vm_page_alloc_contig(obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_ZERO, obj->size,
+ 0, (uint64_t) -1, 1, 0, VM_MEMATTR_DEFAULT);
+ VM_OBJECT_WUNLOCK(obj);
+ if (m == NULL) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+
+ baseaddr = VM_MIN_KERNEL_ADDRESS;
+ error = vm_map_find(kernel_map, obj, 0, &baseaddr, bytes, VM_MAX_KERNEL_ADDRESS,
+ VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+ end_m = m + (bytes / PAGE_SIZE);
+ tmp = baseaddr;
+ for (; m < end_m; m++) {
+ vm_page_valid(m);
+ pmap_zero_page(m);
+ pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
+ VM_PROT_RW | PMAP_ENTER_WIRED, 0);
+ tmp += PAGE_SIZE;
+ vm_page_xunbusy(m);
+ }
+
+
+ sc->vtd_baseaddr = baseaddr;
+ sc->vtd_bytes = bytes;
+
+ /* Reserve space for the device control region. */
+ sc->vtd_allocated = VTDBG_RESERVE_DEVSPACE;
+
+ return (0);
+}
+
+static void
+vtdbg_intr(void *arg)
+{
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+
+ mtx_lock(&sc->vtd_mtx);
+ while ((sc->vtd_flags & VTDBG_INTR_EXITING) == 0) {
+ if ((sc->vtd_flags & VTDBG_INTR_PENDING) == 0) {
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ continue;
+ }
+
+ sc->vtd_flags &= ~VTDBG_INTR_PENDING;
+ mtx_unlock(&sc->vtd_mtx);
+
+ if (sc->vtd_intr)
+ sc->vtd_intr(sc->vtd_intr_arg);
+
+ mtx_lock(&sc->vtd_mtx);
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ }
+
+ sc->vtd_pintr = NULL;
+ cv_signal(&sc->vtd_cv);
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ kproc_exit(0);
+}
+
+/*
+ * Destroy the virtio transport instance when closing the
+ * corresponding control device fd.
+ */
+static void
+vtdbg_dtor(void *arg)
+{
+ struct virtio_dbg_softc *devsc;
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+ vm_offset_t sva, eva;
+ device_t dev;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (sc->vtd_pintr != NULL) {
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_EXITING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+
+ mtx_lock(&sc->vtd_mtx);
+ while (sc->vtd_pintr != NULL)
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ mtx_unlock(&sc->vtd_mtx);
+ }
+
+ dev = sc->vtd_dev;
+ if (dev != NULL) {
+ devsc = device_get_softc(dev);
+
+ mtx_lock(&Giant);
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ free(devsc->vtmdbg_mmio.res[0], M_DEVBUF);
+ device_delete_child(vtdbg_parent, dev);
+ }
+
+
+ if (sc->vtd_baseaddr != 0) {
+ sva = sc->vtd_baseaddr;
+ eva = sva + sc->vtd_bytes;
+ vm_map_remove(kernel_map, sva, eva);
+ pmap_remove(kernel_pmap, sva, eva);
+ }
+
+ vm_object_deallocate(sc->vtd_object);
+
+ knlist_delete(&sc->vtd_note, curthread, 0);
+ knlist_destroy(&sc->vtd_note);
+
+ cv_destroy(&sc->vtd_cv);
+ mtx_destroy(&sc->vtd_mtx);
+
+ free(sc, M_DEVBUF);
+}
+
+static int
+vtdbg_open(struct cdev *cdev, int oflags, int devtype, struct thread *td)
+{
+ size_t sz = round_page(VTDBG_MAPSZ);
+ struct vtdbg_softc *sc;
+ int error;
+
+ sc = malloc(sizeof(struct vtdbg_softc), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (sc == NULL)
+ return (ENOMEM);
+
+ sc->vtd_magic = VTDBG_MAGIC;
+ mtx_init(&sc->vtd_mtx, "vtdbg", NULL, MTX_DEF);
+ cv_init(&sc->vtd_cv, "vtdbg");
+
+ knlist_init_mtx(&sc->vtd_note, &sc->vtd_mtx);
+
+ /* Create the common userspace/kernel virtio device region. */
+ sc->vtd_object = vm_pager_allocate(OBJT_PHYS, NULL, sz, VM_PROT_ALL,
+ 0, thread0.td_ucred);
+ if (sc->vtd_object == NULL) {
+ vtdbg_dtor(sc);
+ return (ENOMEM);
+ }
+
+ error = vtdbg_map_kernel(sc);
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = kproc_create(vtdbg_intr, (void *)sc, &sc->vtd_pintr,
+ 0, 0, "vtdbg_intr");
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = devfs_set_cdevpriv((void *)sc, vtdbg_dtor);
+ if (error != 0)
+ vtdbg_dtor(sc);
+
+ return (error);
+}
+
+static int
+vtdbg_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+ vm_size_t size, vm_object_t *objp, int nprot)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+
+ if (*offset + size > sc->vtd_bytes)
+ return (EINVAL);
+
+ vm_object_reference(sc->vtd_object);
+ *objp = sc->vtd_object;
+
+ return (0);
+}
+
+static void *
+vtdbg_ringalloc(device_t dev, size_t size)
+{
+ struct vtdbg_softc *sc = vtmmio_get_vtdbg(dev);
+ void *mem;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ if (sc->vtd_allocated + size > sc->vtd_bytes) {
+ mtx_unlock(&sc->vtd_mtx);
+ return (NULL);
+ }
+
+ mem = (void *)(sc->vtd_baseaddr + sc->vtd_allocated);
+ sc->vtd_allocated += size;
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (mem);
+}
+
+static device_t
+vtdbg_create_transport(device_t parent, struct vtdbg_softc *vtdsc)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ struct resource *res;
+ device_t transport;
+
+ int uid = 0;
+
+ transport = BUS_ADD_CHILD(parent, 0, virtio_dbg_driver.name, uid);
+ device_set_driver(transport, vtdbg_driver);
+
+ sc = device_get_softc(transport);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /*
+ * XXX Hack. Create the resource out of thin air to
+ * keep the vtmmio_write_* calls working. To be uniform we would
+ * reserve the resource from the RAM pseudobus,
+ * but it has no associated struct rman * instance,
+ * and we have already reserved this memory region
+ * by allocating it anyway, so there is no possibility
+ * of conflicts.
+ */
+ res = malloc(sizeof(*res), M_DEVBUF, M_WAITOK);
+ res->r_bushandle = vtdsc->vtd_baseaddr;
+ res->r_bustag = X86_BUS_SPACE_MEM;
+ mmiosc->res[0] = res;
+
+ /* Ring buffer allocation callback. */
+ mmiosc->vtmmio_ringalloc_cb = vtdbg_ringalloc;
+
+ return (transport);
+}
+
+static int
+vtdbg_linkup_transport(struct vtdbg_softc *vtdsc, device_t dev)
+{
+ struct virtio_dbg_softc *mmiosc;
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ if (vtdsc->vtd_dev != NULL) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ return (EALREADY);
+ }
+
+ mmiosc = device_get_softc(dev);
+
+ /* Have the device and cdev be able to refer to each other. */
+ mmiosc->vtmdbg_dbg = vtdsc;
+ vtdsc->vtd_dev = dev;
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (0);
+}
+
+/*
+ * Create the virtio device. This function initializes both
+ * the emulated transport and the virtio device. These are
+ * normally (e.g., for MMIO) created at boot time using vtmmio_probe/vtmmio_attach
+ * and vtmmio_probe_and_attach_child, respectively. We do this initialization
+ * here because we are dynamically creating the devices after booting, so
+ * we must manually invoke the device probe and attach methods.
+ */
+static int
+vtdbg_init(void)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtdbg_softc *vtdsc;
+ device_t transport;
+ int error;
+
+ /* Retrieve the mapping address/size. */
+ error = devfs_get_cdevpriv((void **)&vtdsc);
+ if (error != 0)
+ return (error);
+
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ transport = vtdbg_create_transport(vtdbg_parent, vtdsc);
+
+ error = vtdbg_linkup_transport(vtdsc, transport);
+ if (error != 0)
+ goto err;
+
+ error = DEVICE_PROBE(transport);
+ if (error != 0)
+ goto err;
+
+ return (DEVICE_ATTACH(transport));
+
+err:
+ sc = device_get_softc(transport);
+
+ /*
+ * Release the resource but do not notify
+ * the parent bus, since we did not reserve
+ * the resource from it.
+ */
+ free(sc->vtmdbg_mmio.res[0], M_DEVBUF);
+
+ mtx_lock(&Giant);
+ device_delete_child(vtdbg_parent, transport);
+ mtx_unlock(&Giant);
+
+ vtdsc->vtd_dev = NULL;
+
+ return (error);
+}
+
+/*
+ * Kick the dedicated kernel interrupt process.
+ */
+static void
+vtdbg_kick(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_PENDING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * The mmio virtio code uses note() to let the host know there has been a register write.
+ * The note() call suspends the thread until userspace has emulated the write,
+ * at which point a userspace thread allows it to resume.
+ *
+ * There can only be one unacknowledged write outstanding at a time, so a single
+ * vtd_offset field in the softc is enough.
+ */
+static void
+vtdbg_ack(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_offset = 0;
+ wakeup(sc);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * Copy virtio data in and out of the kernel; userspace needs this to access
+ * the data pointed to by the virtqueue descriptors.
+ */
+static int
+vtdbg_io(struct vtdbg_softc *sc, struct vtdbg_io_args *args)
+{
+ struct vtdbg_transfer *tf;
+ caddr_t driver, device;
+ int error = 0;
+ size_t len;
+ int i;
+
+ tf = malloc(args->cnt * sizeof(*tf), M_DEVBUF, M_NOWAIT);
+ if (tf == NULL)
+ return (ENOMEM);
+
+ error = copyin(args->transfers, tf, args->cnt * (sizeof(*tf)));
+ if (error != 0) {
+ free(tf, M_DEVBUF);
+ return (error);
+ }
+
+ for (i = 0; i < args->cnt; i++) {
+ /* Translate from physical to kernel virtual. */
+ driver = (caddr_t)PHYS_TO_DMAP((vm_paddr_t)tf[i].vtdt_driver);
+ device = tf[i].vtdt_device;
+ len = tf[i].vtdt_len;
+
+ if (args->touser)
+ error = copyout(driver, device, len);
+ else
+ error = copyin(device, driver, len);
+
+ if (error != 0)
+ break;
+ }
+
+ free(tf, M_DEVBUF);
+
+ return (error);
+}
+
+
+static int
+vtdbg_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct vtdbg_softc *sc;
+ int ret = 0;
+
+ ret = devfs_get_cdevpriv((void **)&sc);
+ if (ret != 0)
+ return (ret);
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ switch (cmd) {
+ case VIRTIO_DBG_INIT:
+ ret = vtdbg_init();
+ break;
+ case VIRTIO_DBG_KICK:
+ vtdbg_kick(sc);
+ break;
+ case VIRTIO_DBG_ACK:
+ vtdbg_ack(sc);
+ break;
+ case VIRTIO_DBG_TRANSFER:
+ ret = vtdbg_io(sc, (struct vtdbg_io_args *)data);
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+vtdbg_filt_attach(struct knote *kn)
+{
+ kn->kn_flags |= EV_CLEAR;
+ return (0);
+}
+
+static void
+vtdbg_filt_detach(struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ knlist_remove(&sc->vtd_note, kn, 0);
+ kn->kn_hook = NULL;
+}
+
+static int
+vtdbg_filt_read(struct knote *kn, long hint)
+{
+ struct vtdbg_softc *sc;
+
+
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ mtx_assert(&sc->vtd_mtx, MA_OWNED);
+
+ if (sc->vtd_offset == 0)
+ return (0);
+
+ kn->kn_data = sc->vtd_offset;
+
+ return (1);
+}
+
+struct filterops vtdbg_filtops = {
+ .f_isfd = 1,
+ .f_attach = vtdbg_filt_attach,
+ .f_detach = vtdbg_filt_detach,
+ .f_event = vtdbg_filt_read,
+};
+
+static int
+vtdbg_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (kn->kn_filter != EVFILT_READ) {
+ kn->kn_data = EINVAL;
+ return (EINVAL);
+ }
+
+ kn->kn_fop = &vtdbg_filtops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->vtd_note, kn, 0);
+
+ return (0);
+
+}
+
+static struct cdevsw vtdbg_cdevsw = {
+ .d_open = vtdbg_open,
+ .d_mmap_single = vtdbg_mmap_single,
+ .d_ioctl = vtdbg_ioctl,
+ .d_kqfilter = vtdbg_kqfilter,
+ .d_name = "vtdbg",
+ .d_version = D_VERSION,
+};
+
+static int
+vtdbg_dev_create(void)
+{
+ vtdbg_dev = make_dev(&vtdbg_cdevsw, 0, UID_ROOT, GID_OPERATOR,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "vtdbg");
+ if (vtdbg_dev == NULL)
+ return (ENOMEM);
+
+ return (0);
+}
+
+static void
+vtdbg_dev_destroy(void)
+{
+ MPASS(vtdbg_dev != NULL);
+ destroy_dev(vtdbg_dev);
+}
+
+static int
+vtdbg_loader(struct module *m, int what, void *arg)
+{
+ int err = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ err = vtdbg_dev_create();
+ break;
+ case MOD_UNLOAD:
+ vtdbg_dev_destroy();
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (err);
+}
+
+static moduledata_t vtdbg_moddata = {
+ "vtdbg",
+ vtdbg_loader,
+ NULL,
+};
+
+DECLARE_MODULE(vtdbg, vtdbg_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE);
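
[Editorial sketch, not part of the patch: the userspace half of the note()/ack handshake described in vtdbg_ack() above. The emulator waits for EVFILT_READ on the control fd, reads the register offset from kn_data, emulates the access against the mmap'd control region, and issues VIRTIO_DBG_ACK so the busy-waiting kernel thread can continue. emulate_register() is a placeholder, and fd/shared are assumed to come from the setup sketch shown after virtio_dbg.h above.]

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <stddef.h>
#include "virtio_dbg.h"

/* Placeholder: apply the register access found at 'offset' in the shared region. */
void emulate_register(void *shared, size_t offset);

static void
vtdbg_note_loop(int fd, void *shared)
{
	struct kevent ev;
	int kq;

	kq = kqueue();
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
			continue;
		/* ev.data carries the MMIO register offset the kernel wrote. */
		emulate_register(shared, (size_t)ev.data);
		/* Clear vtd_offset and release the spinning kernel thread. */
		ioctl(fd, VIRTIO_DBG_ACK);
	}
}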
diff --git a/sys/dev/virtio/mmio/virtio_mmio.h b/sys/dev/virtio/mmio/virtio_mmio.h
--- a/sys/dev/virtio/mmio/virtio_mmio.h
+++ b/sys/dev/virtio/mmio/virtio_mmio.h
@@ -31,9 +31,12 @@
#ifndef _VIRTIO_MMIO_H
#define _VIRTIO_MMIO_H
+#ifdef _KERNEL
+
DECLARE_CLASS(vtmmio_driver);
struct vtmmio_virtqueue;
+typedef void *vtmmio_alloc_cb_t(device_t, size_t);
struct vtmmio_softc {
device_t dev;
@@ -51,10 +54,25 @@
int vtmmio_nvqs;
struct vtmmio_virtqueue *vtmmio_vqs;
void *ih;
+
+ vtmmio_alloc_cb_t *vtmmio_ringalloc_cb;
};
int vtmmio_probe(device_t);
int vtmmio_attach(device_t);
+void vtmmio_reset(struct vtmmio_softc *);
+uint8_t vtmmio_get_status(device_t);
+void vtmmio_set_status(device_t, uint8_t);
+void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
+
+#define vtmmio_read_config_1(sc, o) \
+ bus_read_1((sc)->res[0], (o))
+#define vtmmio_read_config_2(sc, o) \
+ bus_read_2((sc)->res[0], (o))
+#define vtmmio_read_config_4(sc, o) \
+ bus_read_4((sc)->res[0], (o))
+
+#endif /* _KERNEL */
#define VIRTIO_MMIO_MAGIC_VALUE 0x000
#define VIRTIO_MMIO_VERSION 0x004
diff --git a/sys/dev/virtio/mmio/virtio_mmio.c b/sys/dev/virtio/mmio/virtio_mmio.c
--- a/sys/dev/virtio/mmio/virtio_mmio.c
+++ b/sys/dev/virtio/mmio/virtio_mmio.c
@@ -84,19 +84,15 @@
static void vtmmio_reinit_complete(device_t);
static void vtmmio_notify_virtqueue(device_t, uint16_t, bus_size_t);
static int vtmmio_config_generation(device_t);
-static uint8_t vtmmio_get_status(device_t);
-static void vtmmio_set_status(device_t, uint8_t);
static void vtmmio_read_dev_config(device_t, bus_size_t, void *, int);
static uint64_t vtmmio_read_dev_config_8(struct vtmmio_softc *, bus_size_t);
static void vtmmio_write_dev_config(device_t, bus_size_t, const void *, int);
static void vtmmio_describe_features(struct vtmmio_softc *, const char *,
uint64_t);
-static void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
static int vtmmio_reinit_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_free_interrupts(struct vtmmio_softc *);
static void vtmmio_free_virtqueues(struct vtmmio_softc *);
static void vtmmio_release_child_resources(struct vtmmio_softc *);
-static void vtmmio_reset(struct vtmmio_softc *);
static void vtmmio_select_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_vq_intr(void *);
@@ -128,13 +124,6 @@
VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \
} while (0)
-#define vtmmio_read_config_1(sc, o) \
- bus_read_1((sc)->res[0], (o))
-#define vtmmio_read_config_2(sc, o) \
- bus_read_2((sc)->res[0], (o))
-#define vtmmio_read_config_4(sc, o) \
- bus_read_4((sc)->res[0], (o))
-
static device_method_t vtmmio_methods[] = {
/* Device interface. */
DEVMETHOD(device_attach, vtmmio_attach),
@@ -572,7 +561,7 @@
error = virtqueue_alloc(dev, idx, size,
VIRTIO_MMIO_QUEUE_NOTIFY, VIRTIO_MMIO_VRING_ALIGN,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, sc->vtmmio_ringalloc_cb);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n",
@@ -689,7 +678,7 @@
return (gen);
}
-static uint8_t
+uint8_t
vtmmio_get_status(device_t dev)
{
struct vtmmio_softc *sc;
@@ -699,7 +688,7 @@
return (vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS));
}
-static void
+void
vtmmio_set_status(device_t dev, uint8_t status)
{
struct vtmmio_softc *sc;
@@ -875,7 +864,7 @@
virtio_describe(dev, msg, features, sc->vtmmio_child_feat_desc);
}
-static void
+void
vtmmio_probe_and_attach_child(struct vtmmio_softc *sc)
{
device_t dev, child;
@@ -976,7 +965,7 @@
vtmmio_free_virtqueues(sc);
}
-static void
+void
vtmmio_reset(struct vtmmio_softc *sc)
{
diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c
--- a/sys/dev/virtio/pci/virtio_pci.c
+++ b/sys/dev/virtio/pci/virtio_pci.c
@@ -362,7 +362,7 @@
notify_offset = vtpci_get_vq_notify_off(cn, idx);
error = virtqueue_alloc(dev, idx, size, notify_offset, align,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, NULL);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n", idx, error);
diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h
--- a/sys/dev/virtio/virtio.h
+++ b/sys/dev/virtio/virtio.h
@@ -68,7 +68,9 @@
#define VIRTIO_DRIVER_MODULE(name, driver, evh, arg) \
DRIVER_MODULE(name, virtio_mmio, driver, evh, arg); \
- DRIVER_MODULE(name, virtio_pci, driver, evh, arg)
+ DRIVER_MODULE(name, virtio_pci, driver, evh, arg); \
+ DRIVER_MODULE(name, virtio_dbg, driver, evh, arg)
+
struct virtio_pnp_match {
uint32_t device_type;
@@ -82,6 +84,8 @@
MODULE_PNP_INFO("U32:device_type;D:#", virtio_mmio, driver, \
&driver ## _match, 1); \
MODULE_PNP_INFO("U32:device_type;D:#", virtio_pci, driver, \
+ &driver ## _match, 1); \
+ MODULE_PNP_INFO("U32:device_type;D:#", virtio_dbg, driver, \
&driver ## _match, 1)
#define VIRTIO_SIMPLE_PROBE(dev, driver) \
(virtio_simple_probe(dev, &driver ## _match))
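
[Editorial note: with this change, a virtio device driver that declares itself through these macros is registered on the new virtio_dbg bus as well as virtio_mmio and virtio_pci. A schematic use, with hypothetical names:]

/* foo, foo_driver and foo_modevent are placeholder names, not part of this patch. */
VIRTIO_DRIVER_MODULE(foo, foo_driver, foo_modevent, NULL);
/* ...expands to DRIVER_MODULE() entries for virtio_mmio, virtio_pci and virtio_dbg. */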
diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h
--- a/sys/dev/virtio/virtqueue.h
+++ b/sys/dev/virtio/virtqueue.h
@@ -34,6 +34,7 @@
/* Device callback for a virtqueue interrupt. */
typedef void virtqueue_intr_t(void *);
+typedef void *virtqueue_alloc_cb_t(device_t, size_t);
/*
* Hint on how long the next interrupt should be postponed. This is
@@ -67,7 +68,8 @@
int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp);
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t *cb);
void *virtqueue_drain(struct virtqueue *vq, int *last);
void virtqueue_free(struct virtqueue *vq);
int virtqueue_reinit(struct virtqueue *vq, uint16_t size);
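
[Editorial sketch, not part of the patch: a minimal callback matching the new virtqueue_alloc_cb_t hook, as it would appear inside a transport driver. The function name is illustrative; the contigmalloc() body simply restates the default behaviour a callback must provide - zeroed, physically contiguous, page-aligned ring memory.]

static void *
example_ringalloc(device_t dev, size_t size)
{
	/* Must return zeroed, physically contiguous, page-aligned memory, or NULL. */
	return (contigmalloc(size, M_DEVBUF, M_NOWAIT | M_ZERO, 0,
	    ~(vm_paddr_t)0, PAGE_SIZE, 0));
}

[A transport stores such a function in its softc (as the MMIO code now does with vtmmio_ringalloc_cb), and virtqueue_alloc() calls it instead of allocating the ring itself when the pointer is non-NULL.]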
diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c
--- a/sys/dev/virtio/virtqueue.c
+++ b/sys/dev/virtio/virtqueue.c
@@ -151,7 +151,8 @@
int
virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp)
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t alloc_cb)
{
struct virtqueue *vq;
int error;
@@ -206,8 +207,12 @@
}
vq->vq_ring_size = round_page(vring_size(size, align));
- vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
- M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ if (alloc_cb != NULL) {
+ vq->vq_ring_mem = alloc_cb(dev, vq->vq_ring_size);
+ } else {
+ vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+ M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ }
if (vq->vq_ring_mem == NULL) {
device_printf(dev,
"cannot allocate memory for virtqueue ring\n");
diff --git a/tests/sys/Makefile b/tests/sys/Makefile
--- a/tests/sys/Makefile
+++ b/tests/sys/Makefile
@@ -33,6 +33,7 @@
TESTS_SUBDIRS+= ses
TESTS_SUBDIRS+= sys
TESTS_SUBDIRS+= vfs
+TESTS_SUBDIRS+= virtio
TESTS_SUBDIRS+= vm
TESTS_SUBDIRS+= vmm
diff --git a/tests/sys/virtio/Makefile b/tests/sys/virtio/Makefile
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/Makefile
@@ -0,0 +1,28 @@
+PROG= virtiodbg
+
+.PATH: ${SRCTOP}/sys/libkern
+
+SRCS= block_if.c \
+ config.c \
+ iov.c \
+ iov_emul.c \
+ mevent.c \
+ mmio_virtio_block.c \
+ mmio_emul.c \
+ virtio.c \
+ virtiodbg.c
+
+MAN=
+
+CFLAGS+=-I${.CURDIR} \
+ -I${SRCTOP}/sys
+
+LIBADD+= md nv pthread
+
+# Disable thread safety analysis since it only finds very simple bugs and
+# yields many false positives.
+NO_WTHREAD_SAFETY=
+
+NO_WCAST_ALIGN=
+
+.include <bsd.prog.mk>
diff --git a/tests/sys/virtio/block_if.h b/tests/sys/virtio/block_if.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.h
@@ -0,0 +1,84 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The block API to be used by bhyve block-device emulations. The routines
+ * are thread safe, with no assumptions about the context of the completion
+ * callback - it may occur in the caller's context, or asynchronously in
+ * another thread.
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+#include <sys/nv.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+/*
+ * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
+ * a single request. BLOCKIF_RING_MAX is the maximum number of
+ * pending requests that can be queued.
+ */
+#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */
+#define BLOCKIF_RING_MAX 128
+
+struct blockif_req {
+ int br_iovcnt;
+ off_t br_offset;
+ ssize_t br_resid;
+ void (*br_callback)(struct blockif_req *req, int err);
+ void *br_param;
+ struct iovec br_iov[BLOCKIF_IOV_MAX];
+};
+
+struct mmio_devinst;
+struct blockif_ctxt;
+
+typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t, uint64_t);
+
+int blockif_legacy_config(nvlist_t *nvl, const char *opts);
+struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident);
+int blockif_register_resize_callback(struct blockif_ctxt *bc,
+ blockif_resize_cb *cb, void *cb_arg);
+off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
+int blockif_sectsz(struct blockif_ctxt *bc);
+void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
+int blockif_queuesz(struct blockif_ctxt *bc);
+int blockif_is_ro(struct blockif_ctxt *bc);
+int blockif_candelete(struct blockif_ctxt *bc);
+int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
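
[Editorial sketch, not part of the patch: submitting I/O through the API above from a device emulation. example_done() and example_read() are placeholder names; the request must stay valid until the completion callback runs, possibly on a blockif worker thread.]

#include "block_if.h"

static void
example_done(struct blockif_req *br, int err)
{
	/* Called once the I/O completes; may run on a blockif worker thread. */
}

static int
example_read(struct blockif_ctxt *bc, void *buf, size_t len, off_t off)
{
	static struct blockif_req br;	/* must remain valid until example_done() */

	br.br_iov[0].iov_base = buf;
	br.br_iov[0].iov_len = len;
	br.br_iovcnt = 1;
	br.br_offset = off;
	br.br_resid = len;
	br.br_callback = example_done;
	br.br_param = NULL;
	return (blockif_read(bc, &br));	/* E2BIG if the queue limit is exceeded */
}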
diff --git a/tests/sys/virtio/block_if.c b/tests/sys/virtio/block_if.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.c
@@ -0,0 +1,980 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "block_if.h"
+
+#define BLOCKIF_SIG 0xb109b109
+
+#define BLOCKIF_NUMTHR 8
+#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
+
+enum blockop {
+ BOP_READ,
+ BOP_WRITE,
+ BOP_FLUSH,
+ BOP_DELETE
+};
+
+enum blockstat {
+ BST_FREE,
+ BST_BLOCK,
+ BST_PEND,
+ BST_BUSY,
+ BST_DONE
+};
+
+struct blockif_elem {
+ TAILQ_ENTRY(blockif_elem) be_link;
+ struct blockif_req *be_req;
+ enum blockop be_op;
+ enum blockstat be_status;
+ pthread_t be_tid;
+ off_t be_block;
+};
+
+struct blockif_ctxt {
+ unsigned int bc_magic;
+ int bc_fd;
+ int bc_ischr;
+ int bc_isgeom;
+ int bc_candelete;
+ int bc_rdonly;
+ off_t bc_size;
+ int bc_sectsz;
+ int bc_psectsz;
+ int bc_psectoff;
+ int bc_closing;
+ int bc_paused;
+ pthread_t bc_btid[BLOCKIF_NUMTHR];
+ pthread_mutex_t bc_mtx;
+ pthread_cond_t bc_cond;
+ pthread_cond_t bc_work_done_cond;
+ blockif_resize_cb *bc_resize_cb;
+ void *bc_resize_cb_arg;
+ struct mevent *bc_resize_event;
+
+ /* Request elements and free/pending/busy queues */
+ TAILQ_HEAD(, blockif_elem) bc_freeq;
+ TAILQ_HEAD(, blockif_elem) bc_pendq;
+ TAILQ_HEAD(, blockif_elem) bc_busyq;
+ struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
+ int bc_bootindex;
+};
+
+static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
+
+struct blockif_sig_elem {
+ pthread_mutex_t bse_mtx;
+ pthread_cond_t bse_cond;
+ int bse_pending;
+ struct blockif_sig_elem *bse_next;
+};
+
+static struct blockif_sig_elem *blockif_bse_head;
+
+static int
+blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ struct blockif_elem *be, *tbe;
+ off_t off;
+ int i;
+
+ be = TAILQ_FIRST(&bc->bc_freeq);
+ assert(be != NULL);
+ assert(be->be_status == BST_FREE);
+ TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
+ be->be_req = breq;
+ be->be_op = op;
+ switch (op) {
+ case BOP_READ:
+ case BOP_WRITE:
+ case BOP_DELETE:
+ off = breq->br_offset;
+ for (i = 0; i < breq->br_iovcnt; i++)
+ off += breq->br_iov[i].iov_len;
+ break;
+ default:
+ off = OFF_MAX;
+ }
+ be->be_block = off;
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ if (tbe == NULL) {
+ TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ }
+ if (tbe == NULL)
+ be->be_status = BST_PEND;
+ else
+ be->be_status = BST_BLOCK;
+ TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
+ return (be->be_status == BST_PEND);
+}
+
+static int
+blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
+{
+ struct blockif_elem *be;
+
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_status == BST_PEND)
+ break;
+ assert(be->be_status == BST_BLOCK);
+ }
+ if (be == NULL)
+ return (0);
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ be->be_status = BST_BUSY;
+ be->be_tid = t;
+ TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
+ *bep = be;
+ return (1);
+}
+
+static void
+blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+ struct blockif_elem *tbe;
+
+ if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
+ TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
+ else
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_req->br_offset == be->be_block)
+ tbe->be_status = BST_PEND;
+ }
+ be->be_tid = 0;
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+}
+
+static int
+blockif_flush_bc(struct blockif_ctxt *bc)
+{
+ if (bc->bc_ischr) {
+ if (ioctl(bc->bc_fd, DIOCGFLUSH))
+ return (errno);
+ } else if (fsync(bc->bc_fd))
+ return (errno);
+
+ return (0);
+}
+
+static void
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
+{
+ struct spacectl_range range;
+ struct blockif_req *br;
+ off_t arg[2];
+ ssize_t n;
+ size_t clen, len, off, boff, voff;
+ int i, err;
+
+ br = be->be_req;
+ assert(br->br_resid >= 0);
+
+ if (br->br_iovcnt <= 1)
+ buf = NULL;
+ err = 0;
+ switch (be->be_op) {
+ case BOP_READ:
+ if (buf == NULL) {
+ if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ n = pread(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ len = (size_t)n;
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
+ buf + boff, clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+ off += len;
+ br->br_resid -= len;
+ }
+ break;
+ case BOP_WRITE:
+ if (bc->bc_rdonly) {
+ err = EROFS;
+ break;
+ }
+ if (buf == NULL) {
+ if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy(buf + boff,
+ (uint8_t *)br->br_iov[i].iov_base + voff,
+ clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+
+ n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ off += n;
+ br->br_resid -= n;
+ }
+ break;
+ case BOP_FLUSH:
+ err = blockif_flush_bc(bc);
+ break;
+ case BOP_DELETE:
+ if (!bc->bc_candelete)
+ err = EOPNOTSUPP;
+ else if (bc->bc_rdonly)
+ err = EROFS;
+ else if (bc->bc_ischr) {
+ arg[0] = br->br_offset;
+ arg[1] = br->br_resid;
+ if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
+ err = errno;
+ else
+ br->br_resid = 0;
+ } else {
+ range.r_offset = br->br_offset;
+ range.r_len = br->br_resid;
+
+ while (range.r_len > 0) {
+ if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
+ &range, 0, &range) != 0) {
+ err = errno;
+ break;
+ }
+ }
+ if (err == 0)
+ br->br_resid = 0;
+ }
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ be->be_status = BST_DONE;
+
+ (*br->br_callback)(br, err);
+}
+
+static inline bool
+blockif_empty(const struct blockif_ctxt *bc)
+{
+ return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
+}
+
+static void *
+blockif_thr(void *arg)
+{
+ struct blockif_ctxt *bc;
+ struct blockif_elem *be;
+ pthread_t t;
+ uint8_t *buf;
+
+ bc = arg;
+ if (bc->bc_isgeom)
+ buf = malloc(MAXPHYS);
+ else
+ buf = NULL;
+ t = pthread_self();
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ for (;;) {
+ while (blockif_dequeue(bc, t, &be)) {
+ pthread_mutex_unlock(&bc->bc_mtx);
+ blockif_proc(bc, be, buf);
+ pthread_mutex_lock(&bc->bc_mtx);
+ blockif_complete(bc, be);
+ }
+
+ /* If there is no work left, notify the main thread */
+ if (blockif_empty(bc))
+ pthread_cond_broadcast(&bc->bc_work_done_cond);
+
+ /* Check ctxt status here to see if exit requested */
+ if (bc->bc_closing)
+ break;
+
+ pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ if (buf)
+ free(buf);
+ pthread_exit(NULL);
+ return (NULL);
+}
+
+static void
+blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
+ void *arg __unused, uint64_t data __unused)
+{
+ struct blockif_sig_elem *bse;
+
+ for (;;) {
+ /*
+ * Process the entire list even if not intended for
+ * this thread.
+ */
+ do {
+ bse = blockif_bse_head;
+ if (bse == NULL)
+ return;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)bse,
+ (uintptr_t)bse->bse_next));
+
+ pthread_mutex_lock(&bse->bse_mtx);
+ bse->bse_pending = 0;
+ pthread_cond_signal(&bse->bse_cond);
+ pthread_mutex_unlock(&bse->bse_mtx);
+ }
+}
+
+static void
+blockif_init(void)
+{
+ mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+ (void) signal(SIGCONT, SIG_IGN);
+}
+
+
+struct blockif_ctxt *
+blockif_open(nvlist_t *nvl, const char *ident)
+{
+ char tname[MAXCOMLEN + 1];
+ char name[MAXPATHLEN];
+ const char *path, *pssval, *ssval, *bootindex_val;
+ char *cp;
+ struct blockif_ctxt *bc;
+ struct stat sbuf;
+ struct diocgattr_arg arg;
+ off_t size, psectsz, psectoff;
+ int extra, fd, i, sectsz;
+ int ro, candelete, geom, ssopt, pssopt;
+ int nodelete;
+ int bootindex;
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+ cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
+#endif
+
+ pthread_once(&blockif_once, blockif_init);
+
+ fd = -1;
+ extra = 0;
+ ssopt = 0;
+ ro = 0;
+ nodelete = 0;
+ bootindex = -1;
+
+ if (get_config_bool_node_default(nvl, "nocache", false))
+ extra |= O_DIRECT;
+ if (get_config_bool_node_default(nvl, "nodelete", false))
+ nodelete = 1;
+ if (get_config_bool_node_default(nvl, "sync", false) ||
+ get_config_bool_node_default(nvl, "direct", false))
+ extra |= O_SYNC;
+ if (get_config_bool_node_default(nvl, "ro", false))
+ ro = 1;
+ ssval = get_config_value_node(nvl, "sectorsize");
+ if (ssval != NULL) {
+ ssopt = strtol(ssval, &cp, 10);
+ if (cp == ssval) {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ if (*cp == '\0') {
+ pssopt = ssopt;
+ } else if (*cp == '/') {
+ pssval = cp + 1;
+ pssopt = strtol(pssval, &cp, 10);
+ if (cp == pssval || *cp != '\0') {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ } else {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ }
+
+ bootindex_val = get_config_value_node(nvl, "bootindex");
+ if (bootindex_val != NULL) {
+ bootindex = atoi(bootindex_val);
+ }
+
+ path = get_config_value_node(nvl, "path");
+ if (path == NULL) {
+ EPRINTLN("Missing \"path\" for block device.");
+ goto err;
+ }
+
+ fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
+ if (fd < 0 && !ro) {
+ /* The r/w open failed; retry with a r/o open */
+ fd = open(path, O_RDONLY | extra);
+ ro = 1;
+ }
+
+ if (fd < 0) {
+ warn("Could not open backing file: %s", path);
+ goto err;
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ warn("Could not stat backing file %s", path);
+ goto err;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
+ CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
+ if (ro)
+ cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
+
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ psectsz = psectoff = 0;
+ candelete = geom = 0;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ goto err;
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
+ ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
+ strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+ arg.len = sizeof(arg.value.i);
+ if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
+ candelete = arg.value.i;
+ if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+ geom = 1;
+ } else {
+ psectsz = sbuf.st_blksize;
+ /* Avoid fallback implementation */
+ candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ if (ssopt != 0) {
+ if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+ ssopt > pssopt) {
+ EPRINTLN("Invalid sector size %d/%d",
+ ssopt, pssopt);
+ goto err;
+ }
+
+ /*
+ * Some backend drivers (e.g. cd0, ada0) require that the I/O
+ * size be a multiple of the device's sector size.
+ *
+ * Validate that the emulated sector size complies with this
+ * requirement.
+ */
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+ EPRINTLN("Sector size %d incompatible "
+ "with underlying device sector size %d",
+ ssopt, sectsz);
+ goto err;
+ }
+ }
+
+ sectsz = ssopt;
+ psectsz = pssopt;
+ psectoff = 0;
+ }
+
+ bc = calloc(1, sizeof(struct blockif_ctxt));
+ if (bc == NULL) {
+ perror("calloc");
+ goto err;
+ }
+
+ bc->bc_magic = BLOCKIF_SIG;
+ bc->bc_fd = fd;
+ bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+ bc->bc_isgeom = geom;
+ bc->bc_candelete = candelete;
+ bc->bc_rdonly = ro;
+ bc->bc_size = size;
+ bc->bc_sectsz = sectsz;
+ bc->bc_psectsz = psectsz;
+ bc->bc_psectoff = psectoff;
+ pthread_mutex_init(&bc->bc_mtx, NULL);
+ pthread_cond_init(&bc->bc_cond, NULL);
+ bc->bc_paused = 0;
+ pthread_cond_init(&bc->bc_work_done_cond, NULL);
+ TAILQ_INIT(&bc->bc_freeq);
+ TAILQ_INIT(&bc->bc_pendq);
+ TAILQ_INIT(&bc->bc_busyq);
+ bc->bc_bootindex = bootindex;
+ for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+ bc->bc_reqs[i].be_status = BST_FREE;
+ TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+ }
+
+ for (i = 0; i < BLOCKIF_NUMTHR; i++) {
+ pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
+ snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
+ pthread_set_name_np(bc->bc_btid[i], tname);
+ }
+
+ return (bc);
+err:
+ if (fd >= 0)
+ close(fd);
+ return (NULL);
+}
+
+static void
+blockif_resized(int fd, enum ev_type type __unused, void *arg,
+ uint64_t data __unused)
+{
+ struct blockif_ctxt *bc;
+ struct stat sb;
+ off_t mediasize;
+
+ if (fstat(fd, &sb) != 0)
+ return;
+
+ if (S_ISCHR(sb.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
+ EPRINTLN("blockif_resized: get mediasize failed: %s",
+ strerror(errno));
+ return;
+ }
+ } else
+ mediasize = sb.st_size;
+
+ bc = arg;
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (mediasize != bc->bc_size) {
+ bc->bc_size = mediasize;
+ bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size, 0);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+}
+
+int
+blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
+ void *cb_arg)
+{
+ struct stat sb;
+ int err;
+
+ if (cb == NULL)
+ return (EINVAL);
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (bc->bc_resize_cb != NULL) {
+ err = EBUSY;
+ goto out;
+ }
+
+ assert(bc->bc_closing == 0);
+
+ if (fstat(bc->bc_fd, &sb) != 0) {
+ err = errno;
+ goto out;
+ }
+
+ bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
+ EVFF_ATTRIB, blockif_resized, bc);
+ if (bc->bc_resize_event == NULL) {
+ err = ENXIO;
+ goto out;
+ }
+
+ bc->bc_resize_cb = cb;
+ bc->bc_resize_cb_arg = cb_arg;
+out:
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ int err;
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ assert(!bc->bc_paused);
+ if (!TAILQ_EMPTY(&bc->bc_freeq)) {
+ /*
+ * Enqueue and inform the block i/o thread
+ * that there is work available
+ */
+ if (blockif_enqueue(bc, breq, op))
+ pthread_cond_signal(&bc->bc_cond);
+ } else {
+ /*
+ * Callers are not allowed to enqueue more than
+ * the specified blockif queue limit. Return an
+ * error to indicate that the queue length has been
+ * exceeded.
+ */
+ err = E2BIG;
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_READ));
+}
+
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+int
+blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_DELETE));
+}
+
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ /* XXX: not waiting while paused */
+
+ /*
+ * Check pending requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be != NULL) {
+ /*
+ * Found it.
+ */
+ blockif_complete(bc, be);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
+ }
+
+ /*
+ * Check in-flight requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ /*
+ * Didn't find it.
+ */
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ /*
+ * Interrupt the processing thread to force it to return
+ * prematurely via its normal callback path.
+ */
+ while (be->be_status == BST_BUSY) {
+ struct blockif_sig_elem bse, *old_head;
+
+ pthread_mutex_init(&bse.bse_mtx, NULL);
+ pthread_cond_init(&bse.bse_cond, NULL);
+
+ bse.bse_pending = 1;
+
+ do {
+ old_head = blockif_bse_head;
+ bse.bse_next = old_head;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)old_head,
+ (uintptr_t)&bse));
+
+ pthread_kill(be->be_tid, SIGCONT);
+
+ pthread_mutex_lock(&bse.bse_mtx);
+ while (bse.bse_pending)
+ pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+ pthread_mutex_unlock(&bse.bse_mtx);
+ }
+
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * The processing thread has been interrupted. Since it's not
+ * clear if the callback has been invoked yet, return EBUSY.
+ */
+ return (EBUSY);
+}
+
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+ void *jval;
+ int i;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ /*
+ * Stop the block i/o thread
+ */
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_closing = 1;
+ if (bc->bc_resize_event != NULL)
+ mevent_disable(bc->bc_resize_event);
+ pthread_mutex_unlock(&bc->bc_mtx);
+ pthread_cond_broadcast(&bc->bc_cond);
+ for (i = 0; i < BLOCKIF_NUMTHR; i++)
+ pthread_join(bc->bc_btid[i], &jval);
+
+ /* XXX Cancel queued i/o's ??? */
+
+ /*
+ * Release resources
+ */
+ bc->bc_magic = 0;
+ close(bc->bc_fd);
+ free(bc);
+
+ return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535L * 16 * 255)
+ sectors = 65535L * 16 * 255;
+
+ if (sectors >= 65536L * 16 * 63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
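+
+/*
+ * Worked example (illustrative arithmetic only): a 1 GiB backing store
+ * with 512-byte sectors holds 2097152 sectors, below the 65536*16*63
+ * threshold, so the adjustments above settle on 63 sectors per track
+ * and 16 heads (2097152 / 63 = 33288 head-cylinders), giving a
+ * reported geometry of C/H/S = 2080/16/63.
+ */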
+
+/*
+ * Accessors
+ */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_size);
+}
+
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_sectsz);
+}
+
+void
+blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ *size = bc->bc_psectsz;
+ *off = bc->bc_psectoff;
+}
+
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (BLOCKIF_MAXREQ - 1);
+}
+
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_rdonly);
+}
+
+int
+blockif_candelete(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_candelete);
+}
diff --git a/tests/sys/virtio/config.h b/tests/sys/virtio/config.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.h
@@ -0,0 +1,129 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#include <sys/nv.h>
+
+/*-
+ * Manages a configuration database backed by an nv(9) list.
+ *
+ * The database only stores string values. Callers should parse
+ * values into other types if needed. String values can reference
+ * other configuration variables using a '%(name)' syntax. In this
+ * case, the name must be the full path of the configuration
+ * variable. The % character can be escaped with a preceding \ to
+ * avoid expansion. Any \ characters must be escaped.
+ *
+ * Configuration variables are stored in a tree. The full path of a
+ * variable is specified as a dot-separated name similar to sysctl(8)
+ * OIDs.
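+ *
+ * For example (illustrative values only), after
+ *
+ *     set_config_value("block0.path", "/tmp/disk.img");
+ *     set_config_value("block0.backup", "%(block0.path).bak");
+ *
+ * get_config_value("block0.backup") expands the reference and returns
+ * "/tmp/disk.img.bak".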
+ */
+
+/*
+ * Fetches the value of a configuration variable. If the "raw" value
+ * contains references to other configuration variables, this function
+ * expands those references and returns a pointer to the parsed
+ * string. The string's storage is only stable until the next call to
+ * this function.
+ *
+ * If no node is found, returns NULL.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+const char *get_config_value_node(const nvlist_t *parent, const char *name);
+
+/*
+ * Similar to get_config_value_node but expects a full path to the
+ * leaf node.
+ */
+const char *get_config_value(const char *path);
+
+/* Initializes the tree to an empty state. */
+void init_config(void);
+
+/*
+ * Creates a configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable. If the node already exists, this returns a pointer to
+ * the existing node.
+ */
+nvlist_t *create_config_node(const char *path);
+
+/*
+ * Looks for an existing configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable.
+ */
+nvlist_t *find_config_node(const char *path);
+
+/*
+ * Similar to the above, but treats the path relative to an existing
+ * 'parent' node rather than as an absolute path.
+ */
+nvlist_t *create_relative_config_node(nvlist_t *parent, const char *path);
+nvlist_t *find_relative_config_node(nvlist_t *parent, const char *path);
+
+/*
+ * Adds or replaces the value of the specified variable.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+void set_config_value_node(nvlist_t *parent, const char *name,
+ const char *value);
+
+/*
+ * Similar to set_config_value_node but only sets the value if it is
+ * not already set.
+ */
+void set_config_value_node_if_unset(nvlist_t *const parent,
+ const char *const name, const char *const value);
+
+/*
+ * Similar to set_config_value_node but expects a full path to the
+ * leaf node.
+ */
+void set_config_value(const char *path, const char *value);
+
+/*
+ * Similar to set_config_value but only sets the value if it is not
+ * already set.
+ */
+void set_config_value_if_unset(const char *const path,
+ const char *const value);
+
+/* Convenience wrappers for boolean variables. */
+bool get_config_bool(const char *path);
+bool get_config_bool_node(const nvlist_t *parent, const char *name);
+bool get_config_bool_default(const char *path, bool def);
+bool get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def);
+void set_config_bool(const char *path, bool value);
+void set_config_bool_node(nvlist_t *parent, const char *name, bool value);
+
+void dump_config(void);
+
+#endif /* !__CONFIG_H__ */
diff --git a/tests/sys/virtio/config.c b/tests/sys/virtio/config.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.c
@@ -0,0 +1,464 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+
+static nvlist_t *config_root;
+
+void
+init_config(void)
+{
+
+ config_root = nvlist_create(0);
+ if (config_root == NULL)
+ err(4, "Failed to create configuration root nvlist");
+}
+
+static nvlist_t *
+_lookup_config_node(nvlist_t *parent, const char *path, bool create)
+{
+ char *copy, *name, *tofree;
+ nvlist_t *nvl, *new_nvl;
+
+ copy = strdup(path);
+ if (copy == NULL)
+ errx(4, "Failed to allocate memory");
+ tofree = copy;
+ nvl = parent;
+ while ((name = strsep(&copy, ".")) != NULL) {
+ if (*name == '\0') {
+ warnx("Invalid configuration node: %s", path);
+ nvl = NULL;
+ break;
+ }
+ if (nvlist_exists_nvlist(nvl, name))
+ /*
+ * XXX-MJ it is incorrect to cast away the const
+ * qualifier like this since the contract with nvlist
+ * says that values are immutable, and some consumers
+ * will indeed add nodes to the returned nvlist. In
+ * practice, however, it appears to be harmless with the
+ * current nvlist implementation, so we just live with
+ * it until the implementation is reworked.
+ */
+ nvl = __DECONST(nvlist_t *,
+ nvlist_get_nvlist(nvl, name));
+ else if (nvlist_exists(nvl, name)) {
+ for (copy = tofree; copy < name; copy++)
+ if (*copy == '\0')
+ *copy = '.';
+ warnx(
+ "Configuration node %s is a child of existing variable %s",
+ path, tofree);
+ nvl = NULL;
+ break;
+ } else if (create) {
+ /*
+ * XXX-MJ as with the case above, "new_nvl" shouldn't be
+ * mutated after its ownership is given to "nvl".
+ */
+ new_nvl = nvlist_create(0);
+ if (new_nvl == NULL)
+ errx(4, "Failed to allocate memory");
+ nvlist_move_nvlist(nvl, name, new_nvl);
+ nvl = new_nvl;
+ } else {
+ nvl = NULL;
+ break;
+ }
+ }
+ free(tofree);
+ return (nvl);
+}
+
+nvlist_t *
+create_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, true));
+}
+
+nvlist_t *
+find_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, false));
+}
+
+nvlist_t *
+create_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, true));
+}
+
+nvlist_t *
+find_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, false));
+}
+
+void
+set_config_value_node(nvlist_t *parent, const char *name, const char *value)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_string(parent, name))
+ nvlist_free_string(parent, name);
+ else if (nvlist_exists(parent, name))
+ errx(4,
+ "Attempting to add value %s to existing node %s of list %p",
+ value, name, parent);
+ nvlist_add_string(parent, name, value);
+}
+
+void
+set_config_value_node_if_unset(nvlist_t *const parent, const char *const name,
+ const char *const value)
+{
+ if (get_config_value_node(parent, name) != NULL) {
+ return;
+ }
+
+ set_config_value_node(parent, name, value);
+}
+
+void
+set_config_value(const char *path, const char *value)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = create_config_node(node_name);
+ if (nvl == NULL)
+ errx(4, "Failed to create configuration node %s",
+ node_name);
+ free(node_name);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_nvlist(nvl, name))
+ errx(4, "Attempting to add value %s to existing node %s",
+ value, path);
+ set_config_value_node(nvl, name, value);
+}
+
+void
+set_config_value_if_unset(const char *const path, const char *const value)
+{
+ if (get_config_value(path) != NULL) {
+ return;
+ }
+
+ set_config_value(path, value);
+}
+
+static const char *
+get_raw_config_value(const char *path)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = find_config_node(node_name);
+ free(node_name);
+ if (nvl == NULL)
+ return (NULL);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_string(nvl, name))
+ return (nvlist_get_string(nvl, name));
+ if (nvlist_exists_nvlist(nvl, name))
+ warnx("Attempting to fetch value of node %s", path);
+ return (NULL);
+}
+
+static char *
+_expand_config_value(const char *value, int depth)
+{
+ FILE *valfp;
+ const char *cp, *vp;
+ char *nestedval, *path, *valbuf;
+ size_t valsize;
+
+ valfp = open_memstream(&valbuf, &valsize);
+ if (valfp == NULL)
+ errx(4, "Failed to allocate memory");
+
+ vp = value;
+ while (*vp != '\0') {
+ switch (*vp) {
+ case '%':
+ if (depth > 15) {
+ warnx(
+ "Too many recursive references in configuration value");
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ if (vp[1] != '(' || vp[2] == '\0')
+ cp = NULL;
+ else
+ cp = strchr(vp + 2, ')');
+ if (cp == NULL) {
+ warnx(
+ "Invalid reference in configuration value \"%s\"",
+ value);
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ vp += 2;
+
+ if (cp == vp) {
+ warnx(
+ "Empty reference in configuration value \"%s\"",
+ value);
+ vp++;
+ break;
+ }
+
+ /* Allocate a C string holding the path. */
+ path = strndup(vp, cp - vp);
+ if (path == NULL)
+ errx(4, "Failed to allocate memory");
+
+ /* Advance 'vp' past the reference. */
+ vp = cp + 1;
+
+ /* Fetch the referenced value. */
+ cp = get_raw_config_value(path);
+ if (cp == NULL)
+ warnx(
+ "Failed to fetch referenced configuration variable %s",
+ path);
+ else {
+ nestedval = _expand_config_value(cp, depth + 1);
+ fputs(nestedval, valfp);
+ free(nestedval);
+ }
+ free(path);
+ break;
+ case '\\':
+ vp++;
+ if (*vp == '\0') {
+ warnx(
+ "Trailing \\ in configuration value \"%s\"",
+ value);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ fputc(*vp, valfp);
+ vp++;
+ break;
+ }
+ }
+ fclose(valfp);
+ return (valbuf);
+}
+
+static const char *
+expand_config_value(const char *value)
+{
+ static char *valbuf;
+
+ if (strchr(value, '%') == NULL)
+ return (value);
+
+ free(valbuf);
+ valbuf = _expand_config_value(value, 0);
+ return (valbuf);
+}
+
+const char *
+get_config_value(const char *path)
+{
+ const char *value;
+
+ value = get_raw_config_value(path);
+ if (value == NULL)
+ return (NULL);
+ return (expand_config_value(value));
+}
+
+const char *
+get_config_value_node(const nvlist_t *parent, const char *name)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_nvlist(parent, name))
+ warnx("Attempt to fetch value of node %s of list %p", name,
+ parent);
+ if (!nvlist_exists_string(parent, name))
+ return (NULL);
+
+ return (expand_config_value(nvlist_get_string(parent, name)));
+}
+
+static bool
+_bool_value(const char *name, const char *value)
+{
+
+ if (strcasecmp(value, "true") == 0 ||
+ strcasecmp(value, "on") == 0 ||
+ strcasecmp(value, "yes") == 0 ||
+ strcmp(value, "1") == 0)
+ return (true);
+ if (strcasecmp(value, "false") == 0 ||
+ strcasecmp(value, "off") == 0 ||
+ strcasecmp(value, "no") == 0 ||
+ strcmp(value, "0") == 0)
+ return (false);
+ err(4, "Invalid value %s for boolean variable %s", value, name);
+}
+
+bool
+get_config_bool(const char *path)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", path);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_default(const char *path, bool def)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_node(const nvlist_t *parent, const char *name)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", name);
+ return (_bool_value(name, value));
+}
+
+bool
+get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(name, value));
+}
+
+void
+set_config_bool(const char *path, bool value)
+{
+
+ set_config_value(path, value ? "true" : "false");
+}
+
+void
+set_config_bool_node(nvlist_t *parent, const char *name, bool value)
+{
+
+ set_config_value_node(parent, name, value ? "true" : "false");
+}
+
+static void
+dump_tree(const char *prefix, const nvlist_t *nvl)
+{
+ const char *name;
+ void *cookie;
+ int type;
+
+ cookie = NULL;
+ while ((name = nvlist_next(nvl, &type, &cookie)) != NULL) {
+ if (type == NV_TYPE_NVLIST) {
+ char *new_prefix;
+
+ asprintf(&new_prefix, "%s%s.", prefix, name);
+ dump_tree(new_prefix, nvlist_get_nvlist(nvl, name));
+ free(new_prefix);
+ } else {
+ assert(type == NV_TYPE_STRING);
+ printf("%s%s=%s\n", prefix, name,
+ nvlist_get_string(nvl, name));
+ }
+ }
+}
+
+void
+dump_config(void)
+{
+ dump_tree("", config_root);
+}
diff --git a/tests/sys/virtio/debug.h b/tests/sys/virtio/debug.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/debug.h
@@ -0,0 +1,40 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEBUG_H_
+#define _DEBUG_H_
+
+
+#define FPRINTLN(filep, fmt, arg...) \
+ do { \
+ fprintf(filep, fmt "\n", ##arg); \
+ } while (0)
+
+#define PRINTLN(fmt, arg...) FPRINTLN(stdout, fmt, ##arg)
+#define EPRINTLN(fmt, arg...) FPRINTLN(stderr, fmt, ##arg)
+
+#endif
diff --git a/tests/sys/virtio/iov.h b/tests/sys/virtio/iov.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IOV_H_
+#define _IOV_H_
+
+void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
+ int *niov2, size_t seek);
+void truncate_iov(struct iovec *iov, int *niov, size_t length);
+size_t count_iov(const struct iovec *iov, int niov);
+ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+ int niov, size_t seek);
+
+#endif /* _IOV_H_ */
diff --git a/tests/sys/virtio/iov.c b/tests/sys/virtio/iov.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include "iov.h"
+
+void
+seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2,
+ size_t seek)
+{
+ size_t remainder = 0;
+ size_t left = seek;
+ int i, j;
+
+ for (i = 0; i < niov1; i++) {
+ size_t toseek = MIN(left, iov1[i].iov_len);
+ left -= toseek;
+
+ if (toseek == iov1[i].iov_len)
+ continue;
+
+ if (left == 0) {
+ remainder = toseek;
+ break;
+ }
+ }
+
+ for (j = i; j < niov1; j++) {
+ iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder;
+ iov2[j - i].iov_len = iov1[j].iov_len - remainder;
+ remainder = 0;
+ }
+
+ *niov2 = j - i;
+}
+
+size_t
+count_iov(const struct iovec *iov, int niov)
+{
+ size_t total = 0;
+ int i;
+
+ for (i = 0; i < niov; i++)
+ total += iov[i].iov_len;
+
+ return (total);
+}
+
+void
+truncate_iov(struct iovec *iov, int *niov, size_t length)
+{
+ size_t done = 0;
+ int i;
+
+ for (i = 0; i < *niov; i++) {
+ size_t toseek = MIN(length - done, iov[i].iov_len);
+ done += toseek;
+
+ if (toseek < iov[i].iov_len || done == length) {
+ iov[i].iov_len = toseek;
+ *niov = i + 1;
+ return;
+ }
+ }
+}
+
+ssize_t
+iov_to_buf(const struct iovec *iov, int niov, void **buf)
+{
+ size_t ptr, total;
+ int i;
+
+ total = count_iov(iov, niov);
+ *buf = realloc(*buf, total);
+ if (*buf == NULL)
+ return (-1);
+
+ for (i = 0, ptr = 0; i < niov; i++) {
+ memcpy((uint8_t *)*buf + ptr, iov[i].iov_base, iov[i].iov_len);
+ ptr += iov[i].iov_len;
+ }
+
+ return (total);
+}
+
+ssize_t
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
+ size_t seek)
+{
+ struct iovec *diov;
+ size_t off = 0, len;
+ int i;
+
+ if (seek > 0) {
+ int ndiov;
+
+ diov = malloc(sizeof(struct iovec) * niov);
+ seek_iov(iov, niov, diov, &ndiov, seek);
+ iov = diov;
+ niov = ndiov;
+ }
+
+ for (i = 0; i < niov && off < buflen; i++) {
+ len = MIN(iov[i].iov_len, buflen - off);
+ memcpy(iov[i].iov_base, (const uint8_t *)buf + off, len);
+ off += len;
+ }
+
+ if (seek > 0)
+ free(diov);
+
+ return ((ssize_t)off);
+}
+
diff --git a/tests/sys/virtio/iov_emul.h b/tests/sys/virtio/iov_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.h
@@ -0,0 +1,20 @@
+#ifndef _IOV_EMUL_E
+#define _IOV_EMUL_E
+
+struct virtio_softc;
+
+struct iov_emul {
+ struct vtdbg_transfer *iove_tf;
+ size_t iove_maxcnt;
+ size_t iove_ind;
+};
+
+#define IOVE_INIT (16)
+
+struct iov_emul *iove_alloc(void);
+void iove_free(struct iov_emul *iove);
+int iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov);
+int iove_import(int fd, struct iov_emul *iove);
+int iove_export(int fd, struct iov_emul *iove);
+
+#endif /* _IOV_EMUL_E */
diff --git a/tests/sys/virtio/iov_emul.c b/tests/sys/virtio/iov_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.c
@@ -0,0 +1,106 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+struct iov_emul *
+iove_alloc(void)
+{
+ struct iov_emul *iove;
+
+ iove = calloc(1, sizeof(*iove));
+ if (iove == NULL)
+ return (NULL);
+
+ iove->iove_tf = calloc(IOVE_INIT, sizeof(*iove->iove_tf));
+ if (iove->iove_tf == NULL) {
+ free(iove);
+ return (NULL);
+ }
+
+ iove->iove_maxcnt = IOVE_INIT;
+
+ return (iove);
+}
+
+void
+iove_free(struct iov_emul *iove)
+{
+ size_t i;
+
+ for (i = 0; i < iove->iove_ind; i++)
+ free(iove->iove_tf[i].vtdt_device);
+
+ free(iove->iove_tf);
+ free(iove);
+}
+
+
+int
+iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov)
+{
+ struct vtdbg_transfer *tf = iove->iove_tf;
+ size_t ind = iove->iove_ind;
+ char *base;
+
+ if (ind == iove->iove_maxcnt) {
+ tf = reallocarray(tf, 2 * iove->iove_maxcnt,
+ sizeof(*tf));
+ if (tf == NULL)
+ return (ENOMEM);
+ iove->iove_tf = tf;
+ iove->iove_maxcnt *= 2;
+ }
+
+ base = malloc(len);
+ if (base == NULL)
+ return (ENOMEM);
+
+ iove->iove_tf[ind].vtdt_device = base;
+ iove->iove_tf[ind].vtdt_driver = (caddr_t) phys;
+ iove->iove_tf[ind].vtdt_len = len;
+ iove->iove_ind += 1;
+
+ iov->iov_base = base;
+ iov->iov_len = len;
+
+ return (0);
+}
+
+
+/*
+ * Import a read IO vector from the kernel.
+ */
+int
+iove_import(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = true,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
+
+/*
+ * Export a write IO vector to the kernel.
+ */
+int
+iove_export(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = false,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
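+
+/*
+ * Illustrative flow (a sketch; "ctrl_fd", "gpa" and "len" are
+ * placeholders rather than names used elsewhere in this file): stage a
+ * driver buffer, pull its contents in through the debug control
+ * device, operate on them, and push the result back out.
+ *
+ *     struct iov_emul *iove = iove_alloc();
+ *     struct iovec iov;
+ *
+ *     if (iove != NULL &&
+ *         iove_add(iove, gpa, len, &iov) == 0 &&
+ *         iove_import(ctrl_fd, iove) == 0) {
+ *             memset(iov.iov_base, 0, iov.iov_len);
+ *             (void)iove_export(ctrl_fd, iove);
+ *     }
+ *     if (iove != NULL)
+ *             iove_free(iove);
+ */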
+
diff --git a/tests/sys/virtio/mevent.h b/tests/sys/virtio/mevent.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.h
@@ -0,0 +1,60 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE,
+ EVF_TIMER,
+ EVF_SIGNAL,
+ EVF_VNODE,
+};
+
+/* Filter flags for EVF_VNODE */
+#define EVFF_ATTRIB 0x0001
+
+typedef void mevent_cb_t(int, enum ev_type, void *, uint64_t);
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type, mevent_cb_t *func,
+ void *param);
+struct mevent *mevent_add_flags(int fd, enum ev_type type, int fflags,
+ mevent_cb_t *func, void *param);
+struct mevent *mevent_add_disabled(int fd, enum ev_type type,
+ mevent_cb_t *func, void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+int mevent_timer_update(struct mevent *evp, int msecs);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/tests/sys/virtio/mevent.c b/tests/sys/virtio/mevent.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.c
@@ -0,0 +1,564 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, and having events be persistent by default.
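+ *
+ * A minimal consumer (sketch; "conn_fd" and the handler body are
+ * illustrative) registers a persistent read event and then hands the
+ * calling thread to the dispatch loop, which does not return:
+ *
+ *     static void
+ *     rx_ready(int fd, enum ev_type type, void *param, uint64_t data)
+ *     {
+ *             char buf[64];
+ *
+ *             (void)read(fd, buf, sizeof(buf));
+ *     }
+ *
+ *     (void)mevent_add(conn_fd, EVF_READ, rx_ready, NULL);
+ *     mevent_dispatch();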
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+static pthread_t mevent_tid;
+static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
+static int mevent_timid = 43;
+static int mevent_pipefd[2];
+static int mfd;
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ mevent_cb_t *me_func;
+#define me_msecs me_fd
+ int me_fd;
+ int me_timid;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state; /* Desired kevent flags. */
+ int me_closefd;
+ int me_fflags;
+ LIST_ENTRY(mevent) me_list;
+};
+
+enum mevent_update_type {
+ UPDATE_ENABLE,
+ UPDATE_DISABLE,
+ UPDATE_TIMER,
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused,
+ uint64_t data __unused)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c = '\0';
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
+
+static void
+mevent_init(void)
+{
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_KQUEUE);
+ if (caph_rights_limit(mfd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ LIST_INIT(&change_head);
+ LIST_INIT(&global_head);
+}
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ if (mevp->me_type == EVF_TIMER)
+ retval = EVFILT_TIMER;
+
+ if (mevp->me_type == EVF_SIGNAL)
+ retval = EVFILT_SIGNAL;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval = EVFILT_VNODE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = mevp->me_state;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval |= EV_CLEAR;
+
+ return (retval);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ switch (mevp->me_type) {
+ case EVF_VNODE:
+ if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
+ retval |= NOTE_ATTRIB;
+ break;
+ case EVF_READ:
+ case EVF_WRITE:
+ case EVF_TIMER:
+ case EVF_SIGNAL:
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+mevent_populate(struct mevent *mevp, struct kevent *kev)
+{
+ if (mevp->me_type == EVF_TIMER) {
+ kev->ident = mevp->me_timid;
+ kev->data = mevp->me_msecs;
+ } else {
+ kev->ident = mevp->me_fd;
+ kev->data = 0;
+ }
+ kev->filter = mevent_kq_filter(mevp);
+ kev->flags = mevent_kq_flags(mevp);
+ kev->fflags = mevent_kq_fflags(mevp);
+ kev->udata = mevp;
+}
+
+static int
+mevent_build(struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ mevent_populate(mevp, &kev[i]);
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state & EV_DELETE) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ uint64_t data;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+ data = kev[i].data;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param, data);
+ }
+}
+
+static struct mevent *
+mevent_add_state(int tfd, enum ev_type type, mevent_cb_t *func, void *param,
+ int state, int fflags)
+{
+ struct kevent kev;
+ struct mevent *lp, *mevp;
+ int ret;
+
+ if (tfd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ pthread_once(&mevent_once, mevent_init);
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry and populate it.
+ */
+ mevp = calloc(1, sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ if (type == EVF_TIMER) {
+ mevp->me_msecs = tfd;
+ mevp->me_timid = mevent_timid++;
+ } else
+ mevp->me_fd = tfd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+ mevp->me_state = state;
+ mevp->me_fflags = fflags;
+
+ /*
+ * Try to add the event. If this fails, report the failure to
+ * the caller.
+ */
+ mevent_populate(mevp, &kev);
+ ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
+ if (ret == -1) {
+ free(mevp);
+ mevp = NULL;
+ goto exit;
+ }
+
+ mevp->me_state &= ~EV_ADD;
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+struct mevent *
+mevent_add(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
+}
+
+struct mevent *
+mevent_add_flags(int tfd, enum ev_type type, int fflags, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
+}
+
+struct mevent *
+mevent_add_disabled(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
+}
+
+static int
+mevent_update(struct mevent *evp, enum mevent_update_type type, int msecs)
+{
+ int newstate;
+
+ mevent_qlock();
+
+ /*
+ * It's not possible to update a deleted event
+ */
+ assert((evp->me_state & EV_DELETE) == 0);
+
+ newstate = evp->me_state;
+ if (type == UPDATE_ENABLE) {
+ newstate |= EV_ENABLE;
+ newstate &= ~EV_DISABLE;
+ } else if (type == UPDATE_DISABLE) {
+ newstate |= EV_DISABLE;
+ newstate &= ~EV_ENABLE;
+ } else {
+ assert(type == UPDATE_TIMER);
+ assert(evp->me_type == EVF_TIMER);
+ newstate |= EV_ADD;
+ evp->me_msecs = msecs;
+ }
+
+ /*
+ * No update needed if enable/disable had no effect
+ */
+ if (evp->me_state != newstate || type == UPDATE_TIMER) {
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not
+ * already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_ENABLE, -1));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_DISABLE, -1));
+}
+
+int
+mevent_timer_update(struct mevent *evp, int msecs)
+{
+ return (mevent_update(evp, UPDATE_TIMER, msecs));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = EV_DELETE;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+
+ pthread_set_name_np(mevent_tid, "mevent");
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int numev;
+ int ret;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ pthread_once(&mevent_once, mevent_init);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+ ret = pipe(mevent_pipefd);
+ if (ret < 0) {
+ perror("pipe");
+ exit(0);
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1 && errno != EINTR) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/tests/sys/virtio/mmio_emul.h b/tests/sys/virtio/mmio_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.h
@@ -0,0 +1,117 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MMIO_EMUL_H_
+#define _MMIO_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+#include <sys/nv.h>
+#include <sys/_pthreadtypes.h>
+
+#include <assert.h>
+
+#define MI_NAMESZ (40)
+
+struct mmio_devinst;
+
+struct mmio_devemu {
+ const char *me_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*me_init)(struct mmio_devinst *, nvlist_t *);
+ void (*me_write)(struct mmio_devinst *mdi, uint64_t offset,
+ int size, uint32_t value);
+};
+#define MMIO_EMUL_SET(x) DATA_SET(mmio_devemu_set, x)
+
+enum mmio_devstate {
+ MIDEV_INVALID,
+ MIDEV_ACKNOWLEDGED,
+ MIDEV_DRIVER_FOUND,
+ MIDEV_FEATURES_OK,
+ MIDEV_LIVE,
+ MIDEV_FAILED,
+ MIDEV_DEVICE_STATES,
+};
+
+struct mmio_devinst {
+ struct mmio_devemu *mi_d;
+ char mi_name[MI_NAMESZ];
+ char *mi_addr; /* VQ control region */
+ size_t mi_bytes; /* Size of region in bytes */
+ int mi_fd; /* File descriptor for the region. */
+ enum mmio_devstate mi_state;
+};
+
+/*
+ * XXX Sensible default until proven otherwise; this should be kept in
+ * sync with the in-kernel header.
+ */
+#define MMIO_TOTAL_SIZE (1024 * 1024 * 10)
+#define MMIO_CTRDEV ("/dev/vtdbg")
+
+int init_mmio(nvlist_t *nvl);
+void mmio_print_supported_devices(void);
+int mmio_parse_device(nvlist_t *nvl, char *opt);
+
+static __inline void
+mmio_set_cfgdata8(struct mmio_devinst *mdi, int offset, uint8_t val)
+{
+ *(uint8_t *)(mdi->mi_addr + offset) = val;
+}
+
+static __inline void
+mmio_set_cfgdata16(struct mmio_devinst *mdi, int offset, uint16_t val)
+{
+ *(uint16_t *)(mdi->mi_addr + offset) = htole16(val);
+}
+
+static __inline void
+mmio_set_cfgdata32(struct mmio_devinst *mdi, int offset, uint32_t val)
+{
+ *(uint32_t *)(mdi->mi_addr + offset) = htole32(val);
+}
+
+static __inline uint8_t
+mmio_get_cfgdata8(struct mmio_devinst *mdi, int offset)
+{
+ return (*(uint8_t *)(mdi->mi_addr + offset));
+}
+
+static __inline uint16_t
+mmio_get_cfgdata16(struct mmio_devinst *mdi, int offset)
+{
+ return le16toh((*(uint16_t *)(mdi->mi_addr + offset)));
+}
+
+static __inline uint32_t
+mmio_get_cfgdata32(struct mmio_devinst *mdi, int offset)
+{
+ return le32toh((*(uint32_t *)(mdi->mi_addr + offset)));
+}
+
+#endif /* _MMIO_EMUL_H_ */
diff --git a/tests/sys/virtio/mmio_emul.c b/tests/sys/virtio/mmio_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.c
@@ -0,0 +1,178 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/nv.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+SET_DECLARE(mmio_devemu_set, struct mmio_devemu);
+
+static struct mmio_devemu *
+mmio_emul_finddev(const char *name)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ if (!strcmp(mdp->me_emu, name)) {
+ return (mdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void *
+mmio_emul_driver_init(void *arg)
+{
+ int error;
+ int fd = (int)(long)arg;
+
+ error = ioctl(fd, VIRTIO_DBG_INIT);
+ if (error < 0) {
+ EPRINTLN("Control device initialization error: %s",
+ strerror(errno));
+ exit(1);
+ }
+ pthread_exit(NULL);
+}
+
+static int
+mmio_emul_control_init(struct mmio_devinst *mdi, struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ pthread_t thread;
+ char *mmio;
+ int err;
+ int fd;
+
+ fd = open(MMIO_CTRDEV, O_RDWR);
+ if (fd == -1) {
+ EPRINTLN("Control device open error: %s",
+ strerror(errno));
+ return (-1);
+ }
+
+ mmio = mmap(NULL, MMIO_TOTAL_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FILE | MAP_SHARED, fd, 0);
+ if (mmio == MAP_FAILED) {
+ EPRINTLN("Control device mapping error: %s",
+ strerror(errno));
+ close(fd);
+ return (-1);
+ }
+
+ mdi->mi_fd = fd;
+ mdi->mi_addr = mmio;
+ mdi->mi_bytes = MMIO_TOTAL_SIZE;
+
+ /*
+ * XXX Hack. We currently hardwire the block device ID. Propagate
+ * the device type in a different way.
+ */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_VIRT);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VERSION, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_DEVICE_ID, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR);
+
+ err = (mde->me_init)(mdi, nvl);
+ if (err != 0)
+ return (err);
+
+ /*
+ * Issue the ioctl out of band, because we will use this thread to
+ * service the register writes triggered by the driver during device
+ * attach.
+ */
+ return (pthread_create(&thread, NULL, mmio_emul_driver_init, (void *)(long)fd));
+}
+
+static int
+mmio_emul_init(struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ struct mmio_devinst *mdi;
+ int err;
+
+ mdi = calloc(1, sizeof(struct mmio_devinst));
+ if (mdi == NULL)
+ return (ENOMEM);
+
+ snprintf(mdi->mi_name, sizeof(mdi->mi_name), "%s@mmio", mde->me_emu);
+ mdi->mi_state = MIDEV_INVALID;
+ mdi->mi_fd = -1;
+
+ err = mmio_emul_control_init(mdi, mde, nvl);
+ if (err != 0) {
+ free(mdi);
+ return (err);
+ }
+
+ return (0);
+}
+
+int
+mmio_parse_device(nvlist_t *nvl, char *opt)
+{
+ struct mmio_devemu *mde;
+ char *emul = opt;
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+ EPRINTLN("unknown mmio device %s\n", emul);
+ return (EINVAL);
+ }
+
+ if (get_config_value_node(nvl, "devtype") != NULL) {
+ EPRINTLN("device type already defined!");
+ return (EINVAL);
+ }
+
+ set_config_value_node(nvl, "devtype", mde->me_emu);
+
+ return (0);
+}
+
+
+void
+mmio_print_supported_devices(void)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ printf("%s\n", mdp->me_emu);
+ }
+}
+
+int
+init_mmio(nvlist_t *nvl)
+{
+ struct mmio_devemu *mde;
+ const char *emul;
+
+ emul = get_config_value_node(nvl, "devtype");
+ if (emul == NULL) {
+ EPRINTLN("mmio device missing devtype value");
+ return (EINVAL);
+ }
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+ EPRINTLN("mmio unknown device \"%s\"", emul);
+ return (EINVAL);
+ }
+
+ return (mmio_emul_init(mde, nvl));
+}
diff --git a/tests/sys/virtio/mmio_virtio_block.c b/tests/sys/virtio/mmio_virtio_block.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_virtio_block.c
@@ -0,0 +1,560 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ * Copyright 2020-2021 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <stdbool.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <md5.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+#include "block_if.h"
+#include "iov_emul.h"
+
+#define VTBLK_BSIZE 512
+#define VTBLK_RINGSZ 128
+
+_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+#define VTBLK_S_UNSUPP 2
+
+#define VTBLK_BLK_ID_BYTES 20 + 1
+
+/* Capability bits */
+#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */
+#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */
+#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */
+#define VTBLK_F_RO (1 << 5) /* Disk is read-only */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/
+#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */
+#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */
+#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */
+#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */
+#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */
+#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */
+#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */
+#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( VTBLK_F_SEG_MAX | \
+ VTBLK_F_BLK_SIZE | \
+ VTBLK_F_FLUSH | \
+ VTBLK_F_TOPOLOGY )
+ /* XXX Reactivate */
+// VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
+
+/*
+ * The current blockif_delete() interface only allows a single delete
+ * request at a time.
+ */
+#define VTBLK_MAX_DISCARD_SEG 1
+
+/*
+ * An arbitrary limit to prevent excessive latency due to large
+ * delete requests.
+ */
+#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */
+
+/*
+ * Config space "registers"
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ struct {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } vbc_geometry;
+ uint32_t vbc_blk_size;
+ struct {
+ uint8_t physical_block_exp;
+ uint8_t alignment_offset;
+ uint16_t min_io_size;
+ uint32_t opt_io_size;
+ } vbc_topology;
+ uint8_t vbc_writeback;
+ uint8_t unused0[1];
+ uint16_t num_queues;
+ uint32_t max_discard_sectors;
+ uint32_t max_discard_seg;
+ uint32_t discard_sector_alignment;
+ uint32_t max_write_zeroes_sectors;
+ uint32_t max_write_zeroes_seg;
+ uint8_t write_zeroes_may_unmap;
+ uint8_t unused1[3];
+} __packed;
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+#define VBH_OP_SCSI_CMD 2
+#define VBH_OP_SCSI_CMD_OUT 3
+#define VBH_OP_FLUSH 4
+#define VBH_OP_FLUSH_OUT 5
+#define VBH_OP_IDENT 8
+#define VBH_OP_DISCARD 11
+#define VBH_OP_WRITE_ZEROES 13
+
+#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int mmio_vtblk_debug;
+#define DPRINTF(params) if (mmio_vtblk_debug) PRINTLN params
+#define WPRINTF(params) PRINTLN params
+
+struct mmio_vtblk_ioreq {
+ struct blockif_req io_req;
+ struct mmio_vtblk_softc *io_sc;
+ uint8_t *io_status;
+ uint16_t io_idx;
+ struct iov_emul *io_iove;
+};
+
+struct virtio_blk_discard_write_zeroes {
+ uint64_t sector;
+ uint32_t num_sectors;
+ struct {
+ uint32_t unmap:1;
+ uint32_t reserved:31;
+ } flags;
+};
+
+/*
+ * Per-device softc
+ */
+struct mmio_vtblk_softc {
+ struct virtio_softc vbsc_vs;
+ pthread_mutex_t vsc_mtx;
+ struct vqueue_info vbsc_vq;
+ struct vtblk_config *vbsc_cfg;
+ struct virtio_consts vbsc_consts;
+ struct blockif_ctxt *bc;
+ char vbsc_ident[VTBLK_BLK_ID_BYTES];
+ struct mmio_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
+};
+
+static void mmio_vtblk_reset(void *);
+static void mmio_vtblk_notify(void *, struct vqueue_info *);
+static int mmio_vtblk_cfgread(void *, int, int, uint32_t *);
+static int mmio_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+ .vc_name = "vtblk",
+ .vc_nvq = 1,
+ .vc_cfgsize = sizeof(struct vtblk_config),
+ .vc_reset = mmio_vtblk_reset,
+ .vc_qnotify = mmio_vtblk_notify,
+ .vc_cfgread = mmio_vtblk_cfgread,
+ .vc_cfgwrite = mmio_vtblk_cfgwrite,
+ .vc_apply_features = NULL,
+ .vc_hv_caps = VTBLK_S_HOSTCAPS,
+};
+
+static void
+mmio_vtblk_reset(void *vsc)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device reset requested !"));
+ vi_reset_dev(&sc->vbsc_vs);
+}
+
+static void
+mmio_vtblk_done_locked(struct mmio_vtblk_ioreq *io, int err)
+{
+ struct mmio_vtblk_softc *sc = io->io_sc;
+ int fd = sc->vbsc_vs.vs_mi->mi_fd;
+
+ /* convert errno into a virtio block error return */
+ if (err == EOPNOTSUPP || err == ENOSYS)
+ *io->io_status = VTBLK_S_UNSUPP;
+ else if (err != 0)
+ *io->io_status = VTBLK_S_IOERR;
+ else
+ *io->io_status = VTBLK_S_OK;
+
+ iove_export(fd, io->io_iove);
+ iove_free(io->io_iove);
+ io->io_iove = NULL;
+
+ /*
+ * Return the descriptor back to the host.
+ * We wrote 1 byte (our status) to host.
+ */
+ vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
+ vq_endchains(&sc->vbsc_vq, 0);
+}
+
+static void
+mmio_vtblk_done(struct blockif_req *br, int err)
+{
+ struct mmio_vtblk_ioreq *io = br->br_param;
+ struct mmio_vtblk_softc *sc = io->io_sc;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ mmio_vtblk_done_locked(io, err);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+mmio_vtblk_proc(struct mmio_vtblk_softc *sc, struct vqueue_info *vq)
+{
+ struct virtio_blk_hdr *vbh;
+ struct mmio_vtblk_ioreq *io;
+ int i, n;
+ int err;
+ ssize_t iolen;
+ int writeop, type;
+ struct vi_req req;
+ struct iovec iov[BLOCKIF_IOV_MAX + 2];
+ struct virtio_blk_discard_write_zeroes *discard;
+
+ n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req);
+
+ /*
+ * The first descriptor will be the read-only fixed header,
+ * and the last is for status (hence +2 above and below).
+ * The remaining iov's are the actual data I/O vectors.
+ *
+ * XXX - note - this fails on crash dump, which does a
+ * VIRTIO_BLK_T_FLUSH with a zero transfer length
+ */
+ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
+
+ io = &sc->vbsc_ios[req.idx];
+ assert(req.readable != 0);
+ assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+ vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
+ memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
+ io->io_req.br_iovcnt = n - 2;
+ io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
+ io->io_status = (uint8_t *)iov[--n].iov_base;
+ io->io_iove = req.iove;
+ assert(req.writable != 0);
+ assert(iov[n].iov_len == 1);
+
+ /*
+ * XXX
+ * The guest should not be setting the BARRIER flag because
+ * we don't advertise the capability.
+ */
+ type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+ writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
+ /*
+ * - Write op implies read-only descriptor
+ * - Read/ident op implies write-only descriptor
+ *
+ * By taking away either the read-only fixed header or the write-only
+ * status iovec, the following condition should hold true.
+ */
+ assert(n == (writeop ? req.readable : req.writable));
+
+ iolen = 0;
+ for (i = 1; i < n; i++) {
+ iolen += iov[i].iov_len;
+ }
+ io->io_req.br_resid = iolen;
+
+ DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
+ writeop ? "write/discard" : "read/ident", iolen, i - 1,
+ io->io_req.br_offset));
+
+ switch (type) {
+ case VBH_OP_READ:
+ err = blockif_read(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_WRITE:
+ err = blockif_write(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_DISCARD:
+ /*
+ * We currently only support a single discard segment; if the
+ * guest submits a request that does not conform to that
+ * requirement, return an error.
+ */
+ if (iov[1].iov_len != sizeof (*discard)) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ /* The segments to discard are provided rather than data */
+ discard = (struct virtio_blk_discard_write_zeroes *)
+ iov[1].iov_base;
+
+ /*
+ * virtio v1.1 5.2.6.2:
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
+ * for discard and write zeroes commands if any unknown flag is
+ * set. Furthermore, the device MUST set the status byte to
+ * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
+ * is set.
+ *
+ * Currently there are no known flags for a DISCARD request.
+ */
+ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
+ mmio_vtblk_done_locked(io, ENOTSUP);
+ return;
+ }
+
+ /* Make sure the request doesn't exceed our size limit */
+ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
+ io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
+ err = blockif_delete(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_FLUSH:
+ case VBH_OP_FLUSH_OUT:
+ err = blockif_flush(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_IDENT:
+ /* Assume a single buffer. */
+ /* A serial number that fills the buffer is not zero-terminated. */
+ memset(iov[1].iov_base, 0, iov[1].iov_len);
+ strncpy(iov[1].iov_base, sc->vbsc_ident,
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ mmio_vtblk_done_locked(io, 0);
+ return;
+ default:
+ mmio_vtblk_done_locked(io, EOPNOTSUPP);
+ return;
+ }
+ assert(err == 0);
+}
+
+static void
+mmio_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ while (vq_has_descs(vq))
+ mmio_vtblk_proc(sc, vq);
+}
+
+static void
+mmio_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg,
+ size_t new_size, uint64_t data __unused)
+{
+ struct mmio_vtblk_softc *sc;
+
+ sc = arg;
+
+ sc->vbsc_cfg->vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */
+ /* XXX Handle resizing. */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+static void
+mmio_vtblk_event(int fd, enum ev_type type, void *arg, uint64_t offset)
+{
+ struct mmio_vtblk_softc *sc = (struct mmio_vtblk_softc *)arg;
+ struct mmio_devinst *mdi = sc->vbsc_vs.vs_mi;
+
+ assert(fd == mdi->mi_fd);
+ assert(type == EVF_READ);
+
+ vi_mmio_write(&sc->vbsc_vs, offset);
+
+ /* Let in-progress operations continue. */
+ ioctl(mdi->mi_fd, VIRTIO_DBG_ACK);
+}
+
+static int
+mmio_vtblk_init(struct mmio_devinst *mdi, nvlist_t *nvl)
+{
+ char bident[MI_NAMESZ];
+ struct blockif_ctxt *bctxt;
+ const char *path, *serial;
+ MD5_CTX mdctx;
+ u_char digest[16];
+ struct mmio_vtblk_softc *sc;
+ off_t size;
+ int i, sectsz, sts, sto;
+
+ /* The supplied backing file has to exist. */
+ /* Make sure the name fits. */
+ snprintf(bident, sizeof(bident), "%s", mdi->mi_name);
+ bctxt = blockif_open(nvl, bident);
+ if (bctxt == NULL) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ size = blockif_size(bctxt);
+ sectsz = blockif_sectsz(bctxt);
+ blockif_psectsz(bctxt, &sts, &sto);
+
+ sc = calloc(1, sizeof(struct mmio_vtblk_softc));
+ sc->vbsc_cfg = (struct vtblk_config *)((uint64_t)mdi->mi_addr + VIRTIO_MMIO_CONFIG);
+
+ sc->bc = bctxt;
+ for (i = 0; i < VTBLK_RINGSZ; i++) {
+ struct mmio_vtblk_ioreq *io = &sc->vbsc_ios[i];
+ io->io_req.br_callback = mmio_vtblk_done;
+ io->io_req.br_param = io;
+ io->io_sc = sc;
+ io->io_idx = i;
+ }
+
+ bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
+ if (blockif_candelete(sc->bc))
+ sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /* init virtio softc and virtqueues */
+ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, mdi, &sc->vbsc_vq);
+ sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+ sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+ /* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+ /*
+ * If an explicit identifier is not given, create an
+ * identifier using parts of the md5 sum of the filename.
+ */
+ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES);
+ if ((serial = get_config_value_node(nvl, "serial")) != NULL ||
+ (serial = get_config_value_node(nvl, "ser")) != NULL) {
+ strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES);
+ } else {
+ path = get_config_value_node(nvl, "path");
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, path, strlen(path));
+ MD5Final(digest, &mdctx);
+ snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
+ "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4],
+ digest[5]);
+ }
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg->vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
+ sc->vbsc_cfg->vbc_size_max = 0; /* not negotiated */
+
+ /*
+ * If Linux is presented with a seg_max greater than the virtio queue
+ * size, it can stumble into situations where it violates its own
+ * invariants and panics. For safety, we keep seg_max clamped, paying
+ * heed to the two extra descriptors needed for the header and status
+ * of a request.
+ */
+ sc->vbsc_cfg->vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
+ sc->vbsc_cfg->vbc_geometry.cylinders = 0; /* no geometry */
+ sc->vbsc_cfg->vbc_geometry.heads = 0;
+ sc->vbsc_cfg->vbc_geometry.sectors = 0;
+ sc->vbsc_cfg->vbc_blk_size = sectsz;
+ sc->vbsc_cfg->vbc_topology.physical_block_exp =
+ (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
+ sc->vbsc_cfg->vbc_topology.alignment_offset =
+ (sto != 0) ? ((sts - sto) / sectsz) : 0;
+ sc->vbsc_cfg->vbc_topology.min_io_size = 0;
+ sc->vbsc_cfg->vbc_topology.opt_io_size = 0;
+ sc->vbsc_cfg->vbc_writeback = 0;
+ sc->vbsc_cfg->max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
+ sc->vbsc_cfg->max_discard_seg = VTBLK_MAX_DISCARD_SEG;
+ sc->vbsc_cfg->discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE;
+
+ mevent_add(mdi->mi_fd, EVF_READ, mmio_vtblk_event, sc);
+ blockif_register_resize_callback(sc->bc, mmio_vtblk_resized, sc);
+
+ return (0);
+}
+
+static int
+mmio_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused,
+ uint32_t value __unused)
+{
+
+ DPRINTF(("vtblk: write to readonly reg %d", offset));
+ return (1);
+}
+
+static int
+mmio_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+ void *ptr;
+
+ /* our caller has already verified offset and size */
+ ptr = (uint8_t *)sc->vbsc_cfg + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+
+static const struct mmio_devemu mmio_de_vblk = {
+ .me_emu = "virtio-blk",
+ .me_init = mmio_vtblk_init,
+};
+MMIO_EMUL_SET(mmio_de_vblk);
diff --git a/tests/sys/virtio/virtio.h b/tests/sys/virtio/virtio.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.h
@@ -0,0 +1,323 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _BHYVE_VIRTIO_H_
+#define _BHYVE_VIRTIO_H_
+
+#include <machine/atomic.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtio_ring.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ * https://github.com/rustyrussell/virtio-spec
+ * http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ * +-----------------------------------------------+
+ * | "desc": <N> descriptors, 16 bytes each |
+ * | ----------------------------------------- |
+ * | "avail": 2 uint16; <N> uint16; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ * | "used": 2 x uint16; <N> elems; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set :
+ * NEXT descriptor is chained, so use its "next" field
+ * WRITE descriptor is for host to write into guest RAM
+ * (else host is to read from guest RAM)
+ * INDIRECT descriptor address field is (guest physical)
+ * address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <N> <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN 4096
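Illustrative sketch (not part of this change): to make the layout described above concrete, the helper below (an invented name) computes where the avail and used rings land for a queue of n descriptors, using the legacy split-ring sizes from the comment: 16-byte descriptors, 16-bit ring entries, 8-byte used elements, and the VRING_ALIGN boundary just defined.

#include <sys/param.h>	/* roundup2() */
#include <stddef.h>

static void
vring_example_offsets(unsigned int n, size_t *avail_off, size_t *used_off,
    size_t *total)
{
	size_t desc_sz, avail_sz, used_sz;

	desc_sz = 16 * (size_t)n;		/* <N> descriptors, 16 bytes each */
	avail_sz = 2 * 2 + 2 * n + 2;		/* flags, idx, ring[N], used_event */
	used_sz = 2 * 2 + 8 * n + 2;		/* flags, idx, elems[N], avail_event */

	*avail_off = desc_sz;			/* "avail" follows "desc" directly */
	*used_off = roundup2(desc_sz + avail_sz, VRING_ALIGN);
	*total = roundup2(*used_off + used_sz, VRING_ALIGN);
}

For n = 256 this yields avail at byte 4096, used at byte 8192 and 12288 bytes in total, i.e. three pages, which matches what vring_size_aligned() below computes for the same queue size.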
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_CONSOLE 0x1003
+#define VIRTIO_DEV_SCSI 0x1004
+#define VIRTIO_DEV_RANDOM 0x1005
+#define VIRTIO_DEV_9P 0x1009
+#define VIRTIO_DEV_INPUT 0x1052
+
+/*
+ * PCI revision IDs
+ */
+#define VIRTIO_REV_INPUT 1
+
+/*
+ * PCI subvendor IDs
+ */
+#define VIRTIO_SUBVEN_INPUT 0x108E
+
+/*
+ * PCI subdevice IDs
+ */
+#define VIRTIO_SUBDEV_INPUT 0x1100
+
+/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
+static inline int
+vring_size_aligned(u_int qsz)
+{
+ return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN));
+}
+
+struct mmio_devinst;
+struct vqueue_info;
+
+struct virtio_softc {
+ struct virtio_consts *vs_vc; /* constants (see below) */
+ int vs_flags; /* VIRTIO_* flags from above */
+ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
+ struct mmio_devinst *vs_mi; /* MMIO device instance */
+ uint32_t vs_negotiated_caps; /* negotiated capabilities */
+ struct vqueue_info *vs_queues; /* one per vc_nvq */
+ int vs_curq; /* current queue */
+};
+
+#define VS_LOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_lock(vs->vs_mtx); \
+} while (0)
+
+#define VS_UNLOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_unlock(vs->vs_mtx); \
+} while (0)
+
+struct virtio_consts {
+ const char *vc_name; /* name of driver (for diagnostics) */
+ int vc_nvq; /* number of virtual queues */
+ size_t vc_cfgsize; /* size of dev-specific config regs */
+ void (*vc_reset)(void *); /* called on virtual device reset */
+ void (*vc_qnotify)(void *, struct vqueue_info *);
+ /* called on QNOTIFY if no VQ notify */
+ int (*vc_cfgread)(void *, int, int, uint32_t *);
+ /* called to read config regs */
+ int (*vc_cfgwrite)(void *, int, int, uint32_t);
+ /* called to write config regs */
+ void (*vc_apply_features)(void *, uint64_t);
+ /* called to apply negotiated features */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset. When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one. The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define VQ_ALLOC 0x01 /* set once we have a pfn */
+#define VQ_BROKED 0x02 /* ??? */
+struct vqueue_info {
+ uint16_t vq_qsize; /* size of this queue (a power of 2) */
+ void (*vq_notify)(void *, struct vqueue_info *);
+ /* called instead of vc_notify, if not NULL */
+
+ struct virtio_softc *vq_vs; /* backpointer to softc */
+ uint16_t vq_num; /* we're the num'th queue in the softc */
+
+ uint16_t vq_flags; /* flags (see above) */
+ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */
+ uint16_t vq_next_used; /* index of the next used slot to be filled */
+ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */
+
+ uint32_t vq_offset; /* Offset in the control region */
+
+ struct vring_desc *vq_desc; /* descriptor array */
+ struct vring_avail *vq_avail; /* the "avail" ring */
+ struct vring_used *vq_used; /* the "used" ring */
+};
+
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+ (*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+ ((vq)->vq_avail->ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+ return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors? (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+ return (vq_ring_ready(vq) && vq->vq_last_avail !=
+ vq->vq_avail->idx);
+}
+
+
+static inline void
+vq_kick_enable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ /*
+ * Full memory barrier to make sure the store to vq_used->flags
+ * happens before the load from vq_avail->idx, which results from a
+ * subsequent call to vq_has_descs().
+ */
+ atomic_thread_fence_seq_cst();
+}
+
+static inline void
+vq_kick_disable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY;
+}
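Illustrative sketch (not part of this change): a device-side consumer typically pairs the two helpers above so that a notification raced in by the driver is never lost: drain the ring, re-enable kicks, then check one last time before going idle. The function below is invented for the example and takes the per-chain work as a callback.

static void
example_drain_queue(struct vqueue_info *vq,
    void (*process_chain)(struct vqueue_info *))
{
	for (;;) {
		while (vq_has_descs(vq))
			process_chain(vq);

		/* Ask the driver to notify us again ... */
		vq_kick_enable(vq);
		/* ... but re-check in case descriptors arrived meanwhile. */
		if (!vq_has_descs(vq))
			break;
		vq_kick_disable(vq);
	}
}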
+
+struct iovec;
+
+/*
+ * Request description returned by vq_getchain.
+ *
+ * Writable iovecs start at iov[req.readable].
+ */
+struct vi_req {
+ int readable; /* num of readable iovecs */
+ int writable; /* num of writable iovecs */
+ unsigned int idx; /* ring index */
+ struct iov_emul *iove; /* writable I/O state, exported on completion */
+};
+
+void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mi,
+ struct vqueue_info *queues);
+int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void vi_reset_dev(struct virtio_softc *);
+
+int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp);
+void vq_retchains(struct vqueue_info *vq, uint16_t n_chains);
+void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx,
+ uint32_t iolen);
+void vq_relchain_publish(struct vqueue_info *vq);
+void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
+void vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+void vi_mmio_write(struct virtio_softc *vs, uint64_t offset);
+#endif /* _BHYVE_VIRTIO_H_ */
diff --git a/tests/sys/virtio/virtio.c b/tests/sys/virtio/virtio.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.c
@@ -0,0 +1,886 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ * Copyright (c) 2019 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * In case we decide to relax the "virtio softc comes at the
+ * front of virtio-based device softc" constraint, let's use
+ * this to convert.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Link a virtio_softc to its constants, the device softc, and
+ * the PCI emulation.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mdi,
+ struct vqueue_info *queues)
+{
+ int i;
+
+ /* vs and dev_softc addresses must match */
+ assert((void *)vs == dev_softc);
+ vs->vs_vc = vc;
+ vs->vs_mi = mdi;
+
+ vs->vs_queues = queues;
+ for (i = 0; i < vc->vc_nvq; i++) {
+ queues[i].vq_vs = vs;
+ queues[i].vq_num = i;
+ }
+}
+
+/*
+ * Deliver an interrupt to the guest device.
+ */
+static void
+vq_interrupt(struct virtio_softc *vs)
+{
+ int fd = vs->vs_mi->mi_fd;
+ int error;
+
+ mmio_set_cfgdata32(vs->vs_mi, VIRTIO_MMIO_INTERRUPT_STATUS, VIRTIO_MMIO_INT_VRING);
+ error = ioctl(fd, VIRTIO_DBG_KICK);
+ if (error != 0)
+ EPRINTLN("device kick failed with %d\n", error);
+
+}
+
+/*
+ * Reset device (device-wide). This erases all queues, i.e.,
+ * all the queues become invalid (though we don't wipe out the
+ * internal pointers, we just clear the VQ_ALLOC flag).
+ *
+ * It resets negotiated features to "none".
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+ int i, nvq;
+
+ if (vs->vs_mtx)
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
+ nvq = vs->vs_vc->vc_nvq;
+ for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
+ vq->vq_flags = 0;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+ /* XXX Is this right? How should we actually set it? */
+ vq->vq_offset = UINT_MAX;
+ }
+ vs->vs_negotiated_caps = 0;
+ vs->vs_curq = 0;
+
+ mdi->mi_state = MIDEV_INVALID;
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_STATUS, 0);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq).
+ * The guest just gave us a page frame number, from which we can
+ * calculate the addresses of the queue.
+ */
+/* XXX Switch it back to using the virtio softc. */
+static void
+vi_vq_init(struct mmio_devinst *mdi, struct vqueue_info *vq)
+{
+ uint64_t offset;
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vq->vq_desc = (struct vring_desc *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vq->vq_avail = (struct vring_avail *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vq->vq_used = (struct vring_used *)(mdi->mi_addr + offset);
+
+ /* Mark queue as allocated, and start at 0 when we use it. */
+ vq->vq_flags = VQ_ALLOC;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+}
+
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real"
+ * descriptor.
+ */
+static inline void
+_vq_record(int i, struct vring_desc *vd, struct iovec *iov,
+ int n_iov, struct vi_req *reqp, struct iov_emul *wiove,
+ struct iov_emul *riove)
+{
+ if (i >= n_iov)
+ return;
+
+ /* XXX Handle OOM scenarios leading to iove_add failures. */
+
+ /* Preallocate a descriptor data region for the descriptor */
+ if ((vd->flags & VRING_DESC_F_WRITE) == 0) {
+ if (iove_add(riove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->readable++;
+ } else {
+ if (iove_add(wiove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->writable++;
+ }
+}
+#define VQ_MAX_DESCRIPTORS 512 /* see below */
+
+static int
+vq_import_indirect(struct vring_desc **vdp __unused)
+{
+ /* XXX Use the provided vd address to read in the indirect descriptor */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request. If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request. This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1. Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more. We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the "flags" and "next" field of each
+ * descriptor and tells you how many are involved. Since some may
+ * be indirect, this also needs the device instance (in the
+ * mmio_devinst at vs->vs_mi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise) into the given iov[]
+ * array (of the given size). If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1. If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp)
+{
+ int i;
+ u_int ndesc, n_indir;
+ u_int idx, next;
+ struct vi_req req;
+ struct vring_desc *vdir, *vindir, *vp;
+ struct virtio_softc *vs;
+ const char *name;
+ int error;
+ struct iov_emul *riove, *wiove;
+ int fd;
+
+ vs = vq->vq_vs;
+ fd = vs->vs_mi->mi_fd;
+ name = vs->vs_vc->vc_name;
+ memset(&req, 0, sizeof(req));
+
+ vindir = NULL;
+ riove = iove_alloc();
+ wiove = iove_alloc();
+ if (riove == NULL || wiove == NULL) {
+ iove_free(riove);
+ iove_free(wiove);
+ return (-1);
+ }
+
+ /*
+ * Note: it's the responsibility of the guest not to
+ * update vq->vq_avail->idx until all of the descriptors
+ * the guest has written are valid (including all their
+ * "next" fields and "flags").
+ *
+ * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is
+ * the number of descriptors the device has made available
+ * since the last time we updated vq->vq_last_avail.
+ *
+ * We just need to do the subtraction as an unsigned int,
+ * then trim off excess bits.
+ */
+ idx = vq->vq_last_avail;
+ ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx);
+ if (ndesc == 0) {
+ iove_free(riove);
+ iove_free(wiove);
+ return (0);
+ }
+ if (ndesc > vq->vq_qsize) {
+ /* XXX need better way to diagnose issues */
+ EPRINTLN(
+ "%s: ndesc (%u) out of range, driver confused?",
+ name, (u_int)ndesc);
+ goto error;
+ }
+
+ /*
+ * Now count/parse "involved" descriptors starting from
+ * the head of the chain.
+ *
+ * To prevent loops, we could be more complicated and
+ * check whether we're re-visiting a previously visited
+ * index, but we just abort if the count gets excessive.
+ */
+ req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)];
+ req.iove = wiove;
+ vq->vq_last_avail++;
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) {
+ if (next >= vq->vq_qsize) {
+ EPRINTLN(
+ "%s: descriptor index %u out of range, "
+ "driver confused?",
+ name, next);
+ goto error;
+ }
+ vdir = &vq->vq_desc[next];
+ if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) {
+ _vq_record(i, vdir, iov, niov, &req, wiove, riove);
+ i++;
+ } else if ((vs->vs_vc->vc_hv_caps &
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ EPRINTLN(
+ "%s: descriptor has forbidden INDIRECT flag, "
+ "driver confused?",
+ name);
+ goto error;
+ } else {
+ n_indir = vdir->len / 16;
+ if ((vdir->len & 0xf) || n_indir == 0) {
+ EPRINTLN(
+ "%s: invalid indir len 0x%x, "
+ "driver confused?",
+ name, (u_int)vdir->len);
+ goto error;
+ }
+
+ error = vq_import_indirect(&vindir);
+ if (error != 0)
+ goto error;
+ /*
+ * Indirects start at the 0th, then follow
+ * their own embedded "next"s until those run
+ * out. Each one's indirect flag must be off
+ * (we don't really have to check, could just
+ * ignore errors...).
+ */
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->flags & VRING_DESC_F_INDIRECT) {
+ EPRINTLN(
+ "%s: indirect desc has INDIR flag,"
+ " driver confused?",
+ name);
+ goto error;
+ }
+ _vq_record(i, vp, iov, niov, &req, wiove, riove);
+ if (++i > VQ_MAX_DESCRIPTORS) {
+ EPRINTLN(
+ "%s: descriptor loop? count > %d - driver confused?",
+ name, i);
+ goto error;
+ }
+ if ((vp->flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ next = vp->next;
+ if (next >= n_indir) {
+ EPRINTLN(
+ "%s: invalid next %u > %u, "
+ "driver confused?",
+ name, (u_int)next, n_indir);
+ goto error;
+ }
+ }
+ }
+ if ((vdir->flags & VRING_DESC_F_NEXT) == 0)
+ goto done;
+ }
+
+error:
+ iove_free(riove);
+ iove_free(wiove);
+ free(vindir);
+
+ return (-1);
+
+done:
+ /* Read in readable descriptors from the kernel. */
+ error = iove_import(fd, riove);
+ iove_free(riove);
+ free(vindir);
+
+ if (error != 0) {
+ EPRINTLN("Reading in data failed with %d", error);
+ return (-1);
+ }
+
+ *reqp = req;
+ return (i);
+}
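Illustrative sketch (not part of this change): a minimal notify handler built on vq_getchain() could look like the following. example_qnotify() is an invented name, and the iove_export()/iove_free() calls mirror what mmio_vtblk_done_locked() does in the block backend before the chain is returned.

static void
example_qnotify(void *vsc __unused, struct vqueue_info *vq)
{
	struct iovec iov[8];
	struct vi_req req;
	int fd, n;

	fd = vq->vq_vs->vs_mi->mi_fd;
	while (vq_has_descs(vq)) {
		n = vq_getchain(vq, iov, nitems(iov), &req);
		if (n <= 0)
			break;

		/* iov[0 .. req.readable) is readable, the rest is writable. */

		/* Push writable data back to the kernel, then release. */
		iove_export(fd, req.iove);
		iove_free(req.iove);
		vq_relchain(vq, req.idx, 0);
	}
	vq_endchains(vq, 1);
}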
+
+/*
+ * Return the first n_chain request chains back to the available queue.
+ *
+ * (These chains are the ones you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_retchains(struct vqueue_info *vq, uint16_t n_chains)
+{
+
+ vq->vq_last_avail -= n_chains;
+}
+
+void
+vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ struct vring_used *vuh;
+ struct vring_used_elem *vue;
+ uint16_t mask;
+
+ /*
+ * Notes:
+ * - mask is N-1 where N is a power of 2 so computes x % N
+ * - vuh points to the "used" data shared with guest
+ * - vue points to the "used" ring entry we want to update
+ */
+ mask = vq->vq_qsize - 1;
+ vuh = vq->vq_used;
+
+ vue = &vuh->ring[vq->vq_next_used++ & mask];
+ vue->id = idx;
+ vue->len = iolen;
+}
+
+void
+vq_relchain_publish(struct vqueue_info *vq)
+{
+ /*
+ * Ensure the used descriptor is visible before updating the index.
+ * This is necessary on ISAs with memory ordering less strict than x86
+ * (and even on x86 to act as a compiler barrier).
+ */
+ atomic_thread_fence_rel();
+ vq->vq_used->idx = vq->vq_next_used;
+}
+
+/*
+ * Return specified request chain to the guest, setting its I/O length
+ * to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ vq_relchain_prepare(vq, idx, iolen);
+ vq_relchain_publish(vq);
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one. If driver used all the available
+ * chains, used_all should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt. Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when he decided to finish interrupt
+ * processing -- it's possible that descriptors became available after
+ * that point. (It's also typically a constant 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+ struct virtio_softc *vs;
+ uint16_t event_idx, new_idx, old_idx;
+ int intr;
+
+ /*
+ * Interrupt generation: if we're using EVENT_IDX,
+ * interrupt if we've crossed the event threshold.
+ * Otherwise interrupt is generated if we added "used" entries,
+ * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+ *
+ * In any case, though, if NOTIFY_ON_EMPTY is set and the
+ * entire avail was processed, we need to interrupt always.
+ */
+ vs = vq->vq_vs;
+ old_idx = vq->vq_save_used;
+ vq->vq_save_used = new_idx = vq->vq_used->idx;
+
+ /*
+ * Use full memory barrier between "idx" store from preceding
+ * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
+ * "flags" field below.
+ */
+ atomic_thread_fence_seq_cst();
+ if (used_all_avail &&
+ (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+ intr = 1;
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+ event_idx = VQ_USED_EVENT_IDX(vq);
+ /*
+ * This calculation is per docs and the kernel
+ * (see src/sys/dev/virtio/virtio_ring.h).
+ */
+ intr = (uint16_t)(new_idx - event_idx - 1) <
+ (uint16_t)(new_idx - old_idx);
+ } else {
+ intr = new_idx != old_idx &&
+ !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ if (intr)
+ vq_interrupt(vs);
+}
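Illustrative sketch (not part of this change): the EVENT_IDX comparison above, factored into a standalone helper (invented name) so the modulo-2^16 arithmetic is easier to check.

/*
 * True when the used index has moved past used_event, i.e. when
 * used_event lies in [old_idx, new_idx) modulo 2^16, exactly the
 * expression vq_endchains() evaluates above.
 */
static inline int
example_event_idx_intr(uint16_t new_idx, uint16_t old_idx, uint16_t used_event)
{
	return ((uint16_t)(new_idx - used_event - 1) <
	    (uint16_t)(new_idx - old_idx));
}

For instance, with old_idx = 10, new_idx = 13 and used_event = 11 the helper returns 1 and an interrupt is raised; with used_event = 14 the left-hand side wraps to 65534 and the interrupt is suppressed.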
+
+/* Note: these are in sorted order to make for a fast search */
+static struct config_reg {
+ uint16_t cr_offset; /* register offset */
+ uint8_t cr_ro; /* true => reg is read only */
+ const char *cr_name; /* name of reg */
+} config_regs[] = {
+ { VIRTIO_MMIO_MAGIC_VALUE, 1, "MAGIC_VALUE" },
+ { VIRTIO_MMIO_VERSION, 1, "VERSION" },
+ { VIRTIO_MMIO_DEVICE_ID, 1, "DEVICE_ID" },
+ { VIRTIO_MMIO_VENDOR_ID, 1, "VENDOR_ID" },
+ { VIRTIO_MMIO_HOST_FEATURES, 1, "HOST_FEATURES" },
+ { VIRTIO_MMIO_HOST_FEATURES_SEL, 0, "HOST_FEATURES_SEL" },
+ { VIRTIO_MMIO_GUEST_FEATURES, 0, "GUEST_FEATURES" },
+ { VIRTIO_MMIO_GUEST_FEATURES_SEL, 0, "GUEST_FEATURES_SEL" },
+ { VIRTIO_MMIO_QUEUE_SEL, 0, "QUEUE_SEL" },
+ { VIRTIO_MMIO_QUEUE_NUM_MAX, 1, "QUEUE_NUM_MAX" },
+ { VIRTIO_MMIO_QUEUE_NUM, 0, "QUEUE_NUM" },
+ { VIRTIO_MMIO_QUEUE_READY, 0, "QUEUE_READY" },
+ { VIRTIO_MMIO_QUEUE_NOTIFY, 0, "QUEUE_NOTIFY" },
+ { VIRTIO_MMIO_INTERRUPT_STATUS, 1, "INTERRUPT_STATUS" },
+ { VIRTIO_MMIO_INTERRUPT_ACK, 0, "INTERRUPT_ACK" },
+ { VIRTIO_MMIO_STATUS, 0, "STATUS" },
+ { VIRTIO_MMIO_QUEUE_DESC_LOW, 0, "QUEUE_DESC_LOW" },
+ { VIRTIO_MMIO_QUEUE_DESC_HIGH, 0, "QUEUE_DESC_HIGH" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_LOW, 0, "QUEUE_AVAIL_LOW" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_HIGH, 0, "QUEUE_AVAIL_HIGH" },
+ { VIRTIO_MMIO_QUEUE_USED_LOW, 0, "QUEUE_USED_LOW" },
+ { VIRTIO_MMIO_QUEUE_USED_HIGH, 0, "QUEUE_USED_HIGH" },
+ { VIRTIO_MMIO_CONFIG_GENERATION, 1, "CONFIG_GENERATION" },
+};
+
+static inline struct config_reg *
+vi_find_cr(int offset) {
+ u_int hi, lo, mid;
+ struct config_reg *cr;
+
+ lo = 0;
+ hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
+ while (hi >= lo) {
+ mid = (hi + lo) >> 1;
+ cr = &config_regs[mid];
+ if (cr->cr_offset == offset)
+ return (cr);
+ if (cr->cr_offset < offset)
+ lo = mid + 1;
+ else
+ hi = mid - 1;
+ }
+ return (NULL);
+}
+
+static void
+vi_handle_state_change(struct mmio_devinst *mdi, uint32_t status)
+{
+ switch (mdi->mi_state) {
+ case MIDEV_INVALID:
+ if (status & VIRTIO_CONFIG_STATUS_ACK)
+ mdi->mi_state = MIDEV_ACKNOWLEDGED;
+ break;
+
+ case MIDEV_ACKNOWLEDGED:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER)
+ mdi->mi_state = MIDEV_DRIVER_FOUND;
+ break;
+
+ case MIDEV_DRIVER_FOUND:
+ if (status & VIRTIO_CONFIG_S_FEATURES_OK)
+ mdi->mi_state = MIDEV_FEATURES_OK;
+ break;
+
+ case MIDEV_FEATURES_OK:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK)
+ mdi->mi_state = MIDEV_LIVE;
+
+ break;
+
+ case MIDEV_LIVE:
+ break;
+
+ case MIDEV_FAILED:
+ mdi->mi_state = MIDEV_FAILED;
+ break;
+
+ default:
+ EPRINTLN("invalid device state %d", mdi->mi_state);
+ exit(1);
+ }
+}
+
+static void
+vi_handle_status(struct virtio_softc *vs, uint32_t status)
+{
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (status & VIRTIO_CONFIG_STATUS_FAILED) {
+ mdi->mi_state = MIDEV_FAILED;
+ return;
+ }
+
+ if (status & VIRTIO_CONFIG_STATUS_RESET) {
+ mdi->mi_state = MIDEV_INVALID;
+ vi_reset_dev(vs);
+ return;
+ }
+
+ vi_handle_state_change(mdi, status);
+}
+
+static void
+vi_handle_host_features_sel(struct virtio_softc *vs, uint32_t sel)
+{
+ uint64_t caps = vs->vs_vc->vc_hv_caps;
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (sel > 1) {
+ EPRINTLN("HOST_FEATURES SEL 0x%x, "
+ "driver confused?", sel);
+ return;
+ }
+
+ if (sel == 1) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)(caps >> 32));
+ } else {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)caps);
+ }
+}
+
+static void
+vi_handle_guest_features(struct virtio_softc *vs, uint32_t features)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc = vs->vs_vc;
+ uint64_t caps;
+ int hi;
+
+ /*
+ * XXX Add asserts to ensure we are negotiating w/ the device
+ * and not in the middle of an operation.
+ */
+
+ hi = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_GUEST_FEATURES_SEL);
+ if (hi > 1) {
+ EPRINTLN("GUEST_FEATURES_SEL 0x%x, "
+ "driver confused?", hi);
+ return;
+ }
+
+ if (hi == 1) {
+ /* Update the upper bits, keep the lower ones intact. */
+ caps = (vc->vc_hv_caps >> 32) & features;
+ vs->vs_negotiated_caps &= (1UL << 32) - 1;
+ vs->vs_negotiated_caps |= (caps << 32);
+ } else {
+ /* Update the lower bits, keep the upper ones intact. */
+ caps = (uint32_t)vc->vc_hv_caps & features;
+ vs->vs_negotiated_caps &= ~((1UL << 32) - 1);
+ vs->vs_negotiated_caps |= caps;
+
+ /* The LSBs get sent second, we are ready to apply the features. */
+ if (vc->vc_apply_features)
+ (*vc->vc_apply_features)(DEV_SOFTC(vs),
+ vs->vs_negotiated_caps);
+ }
+
+}
+
+
+static void
+vi_handle_queue_sel(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+
+ vs->vs_curq = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_SEL);
+
+ if (vs->vs_curq < 0 || vs->vs_curq >= vs->vs_vc->vc_nvq) {
+ EPRINTLN("Selected queue %d, driver confused?", vs->vs_curq);
+ return;
+ }
+
+ vq = &vs->vs_queues[vs->vs_curq];
+ if (vq_ring_ready(vq)) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 1);
+ return;
+ }
+
+ /* Part of virtqueue initialization. */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_NUM_MAX, vq->vq_qsize);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+
+ return;
+}
+
+static void
+vi_handle_queue_num(struct virtio_softc *vs, int32_t qsize)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+
+ if (qsize > vq->vq_qsize || !powerof2(qsize)) {
+ EPRINTLN("QUEUE_NUM %d is invalid, driver confused?", qsize);
+ return;
+ }
+
+ vq->vq_qsize = qsize;
+}
+
+static void
+vi_handle_queue_ready(struct virtio_softc *vs, uint32_t ready)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (ready > 1) {
+ EPRINTLN("QUEUE_READY has value %d, driver confused?", ready);
+ return;
+ }
+
+ if (ready == 1 && !vq_ring_ready(vq)) {
+ vi_vq_init(mdi, vq);
+ return;
+ }
+}
+
+static void
+vi_handle_interrupt_ack(struct virtio_softc *vs, uint32_t ack)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ /*
+ * Follow the protocol even if we are executing the
+ * interrupt ourselves, so we are the ones that sent
+ * the ACK from the kernel in the first place.
+ */
+ if (ack != 1) {
+ EPRINTLN("INTERRUPT_ACK has value %d, "
+ "driver confused?", ack);
+ return;
+ }
+
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_ACK, 0);
+}
+
+static void
+vi_handle_queue_notify(struct virtio_softc *vs, uint32_t ind)
+{
+ struct virtio_consts *vc = vs->vs_vc;
+ struct vqueue_info *vq;
+
+ if (ind >= (unsigned int)vc->vc_nvq) {
+ EPRINTLN("%s: queue %u notify out of range",
+ vc->vc_name, ind);
+ return;
+ }
+
+ vq = &vs->vs_queues[ind];
+ if (vq->vq_notify) {
+ (*vq->vq_notify)(DEV_SOFTC(vs), vq);
+ } else if (vc->vc_qnotify) {
+ (*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+ } else {
+ EPRINTLN("%s: qnotify value %d: missing vq/vc notify",
+ vc->vc_name, ind);
+ }
+
+}
+
+void
+vi_mmio_write(struct virtio_softc *vs, uint64_t offset)
+{
+ /* Reported writes are always 32-bit. */
+ const int size = 4;
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ const char *name;
+ uint32_t newoff;
+ int32_t value;
+ uint64_t max;
+ int error;
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+
+ /* Writes that land in the device-specific config space go to the device. */
+ if (offset >= VIRTIO_MMIO_CONFIG) {
+ newoff = offset - VIRTIO_MMIO_CONFIG;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : (mdi->mi_bytes - VIRTIO_MMIO_CONFIG);
+ if (newoff + size > max)
+ goto bad;
+
+ value = mmio_get_cfgdata32(mdi, offset);
+
+ if (vc->vc_cfgwrite != NULL)
+ error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+ else
+ error = 0;
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset);
+ if (cr == NULL) {
+ EPRINTLN("%s: write to bad offset %jd",
+ name, (uintmax_t)offset);
+ goto done;
+
+ }
+
+ if (cr->cr_ro) {
+ EPRINTLN("%s: write to read-only reg %s",
+ name, cr->cr_name);
+ goto done;
+ }
+
+ value = mmio_get_cfgdata32(mdi, cr->cr_offset);
+
+ switch (cr->cr_offset) {
+ case VIRTIO_MMIO_STATUS:
+ vi_handle_status(vs, value);
+ break;
+
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ vi_handle_host_features_sel(vs, value);
+ break;
+
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ vi_handle_guest_features(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_SEL:
+ vi_handle_queue_sel(vs);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NUM:
+ vi_handle_queue_num(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_READY:
+ vi_handle_queue_ready(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ vi_handle_queue_notify(vs, value);
+ break;
+
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ vi_handle_interrupt_ack(vs, value);
+ break;
+ default:
+ EPRINTLN("Unhandled offset %d\n", cr->cr_offset);
+ assert(0);
+ }
+
+done:
+
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/tests/sys/virtio/virtiodbg.c b/tests/sys/virtio/virtiodbg.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtiodbg.c
@@ -0,0 +1,105 @@
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+
+static void
+virtiodbg_usage(int code)
+{
+ const char *progname;
+
+ progname = getprogname();
+
+ fprintf(stderr,
+ "Usage: %s [-hot]\n"
+ " -h: help\n"
+ " -o: set config 'var' to 'value'\n"
+ " -t: MMIO device type\n",
+ progname);
+ exit(code);
+}
+
+static bool
+virtiodbg_parse_config_option(nvlist_t *nvl, const char *option)
+{
+ const char *key;
+ char *value;
+
+ key = option;
+ value = strchr(option, '=');
+ if (value == NULL || value[1] == '\0')
+ return (false);
+
+ *value = '\0';
+
+ set_config_value_node(nvl, key, value + 1);
+ return (true);
+}
+
+
+static nvlist_t *
+virtiodbg_optparse(int argc, char **argv)
+{
+ const char *optstr;
+ nvlist_t *nvl;
+ int c;
+
+ nvl = create_config_node("device");
+
+ optstr = "ho:t:";
+ while ((c = getopt(argc, argv, optstr)) != -1) {
+ switch (c) {
+ case 't':
+ if (strncmp(optarg, "help", strlen(optarg)) == 0) {
+ mmio_print_supported_devices();
+ exit(0);
+ } else if (mmio_parse_device(nvl, optarg) != 0)
+ exit(4);
+ else
+ break;
+ case 'o':
+ if (!virtiodbg_parse_config_option(nvl, optarg)) {
+ errx(EX_USAGE,
+ "invalid configuration option '%s'",
+ optarg);
+ }
+ break;
+ case 'h':
+ virtiodbg_usage(0);
+ default:
+ virtiodbg_usage(1);
+ }
+ }
+
+ return (nvl);
+}
+
+int
+main(int argc, char *argv[])
+{
+ nvlist_t *nvl;
+
+ init_config();
+ nvl = virtiodbg_optparse(argc, argv);
+
+ /* Exit if a device emulation finds an error in its initialization */
+ if (init_mmio(nvl) != 0) {
+ EPRINTLN("Device emulation initialization error: %s",
+ strerror(errno));
+ exit(4);
+ }
+
+ /* Head off to the main event dispatch loop. */
+ mevent_dispatch();
+
+ exit(4);
+}
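Illustrative note (not part of this change): with the options above, an invocation of the resulting tool would look roughly like the line below. The exact argument accepted by -t is decided by mmio_parse_device(), which is outside this hunk, so the device string is an assumption; path and serial are the nvlist keys the virtio-blk backend reads.

virtiodbg -t virtio-blk -o path=/tmp/test.img -o serial=VTBLKTEST0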
