D45370.diff
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3438,6 +3438,7 @@
dev/virtio/mmio/virtio_mmio_cmdline.c optional virtio_mmio
dev/virtio/mmio/virtio_mmio_fdt.c optional virtio_mmio fdt
dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio
+dev/virtio/dbg/virtio_dbg.c optional virtio_dbg
dev/virtio/network/if_vtnet.c optional vtnet
dev/virtio/balloon/virtio_balloon.c optional virtio_balloon
dev/virtio/block/virtio_blk.c optional virtio_blk
diff --git a/sys/dev/virtio/dbg/virtio_dbg.h b/sys/dev/virtio/dbg/virtio_dbg.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.h
@@ -0,0 +1,25 @@
+#ifndef _VIRTIO_DBG_
+#define _VIRTIO_DBG_
+
+#include <sys/cdefs.h>
+#include <sys/ioccom.h>
+
+struct vtdbg_transfer {
+ caddr_t vtdt_device;
+ caddr_t vtdt_driver;
+ size_t vtdt_len;
+};
+
+struct vtdbg_io_args {
+ struct vtdbg_transfer *transfers;
+ size_t cnt;
+ bool touser;
+};
+
+#define VIRTIO_DBG_INIT _IO('v', 1)
+#define VIRTIO_DBG_KICK _IO('v', 2)
+#define VIRTIO_DBG_ACK _IO('v', 3)
+#define VIRTIO_DBG_TRANSFER _IOWR('v', 4, struct vtdbg_io_args)
+
+
+#endif /* _VIRTIO_DBG_ */
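The four ioctls above, plus kqueue(2) on the control device, make up the entire userspace-facing protocol. A minimal sketch of an emulator loop built on them follows; it is illustrative only: the /dev/vtdbg node name and the 10 MiB mapping size are assumptions taken from vtdbg_cdevsw and VTDBG_MAPSZ in virtio_dbg.c below, and emulate_register_write() is a hypothetical placeholder.

/*
 * Hypothetical usage sketch, not part of the patch: open the control
 * device, map the shared region, create the transport with
 * VIRTIO_DBG_INIT, then service register writes reported via kqueue,
 * acknowledging each one with VIRTIO_DBG_ACK.
 */
#include <sys/event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#include "virtio_dbg.h"

#define VTDBG_MAPSZ	(1024 * 1024 * 10)	/* mirrors the kernel side */

static void
emulate_register_write(uint8_t *region, size_t offset)
{
	/* Placeholder: a real emulator decodes and acts on the write. */
	(void)region;
	(void)offset;
}

int
main(void)
{
	struct kevent kev;
	uint8_t *region;
	int fd, kq;

	if ((fd = open("/dev/vtdbg", O_RDWR)) < 0)
		err(1, "open");

	/* Shared user/kernel view of the device control region. */
	region = mmap(NULL, VTDBG_MAPSZ, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (region == MAP_FAILED)
		err(1, "mmap");

	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		err(1, "kevent");

	/* Create and attach the emulated transport. */
	if (ioctl(fd, VIRTIO_DBG_INIT) < 0)
		err(1, "VIRTIO_DBG_INIT");

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) < 1)
			break;
		/* kev.data carries the MMIO register offset written. */
		emulate_register_write(region, (size_t)kev.data);
		/* Let the in-kernel driver stop spinning in note(). */
		if (ioctl(fd, VIRTIO_DBG_ACK) < 0)
			err(1, "VIRTIO_DBG_ACK");
	}
	return (0);
}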
diff --git a/sys/dev/virtio/dbg/virtio_dbg.c b/sys/dev/virtio/dbg/virtio_dbg.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.c
@@ -0,0 +1,970 @@
+/*-
+ * Copyright (c) 2024 Emil Tsalapatis
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/bus.h>
+#include <machine/pmap.h>
+#include <machine/resource.h>
+#include <machine/vmparam.h>
+
+#include <dev/virtio/virtio_config.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/dbg/virtio_dbg.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+#include "virtio_mmio_if.h"
+
+#define VTDBG_MAGIC ((uint32_t)0x84848484UL)
+
+/*
+ * XXX Determine these sizes in a well-defined
+ * per-device fashion.
+ */
+#define VTDBG_MAPSZ (1024 * 1024 * 10)
+#define VTDBG_RESERVE_DEVSPACE (4096)
+
+/* XXX Remove after development is done. */
+#define VTDBG_WARN(format, ...) \
+ do { \
+ printf("(%s:%d) " format, __func__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+static device_t vtdbg_parent;
+static driver_t *vtdbg_driver;
+
+#define VTDBG_UPDATE_DESC (0x01)
+#define VTDBG_UPDATE_USED (0x02)
+#define VTDBG_UPDATE_AVAIL (0x04)
+#define VTDBG_INTR_PENDING (0x08)
+#define VTDBG_INTR_EXITING (0x10)
+
+/*
+ * Information on a debug device instance. Accessed
+ * through the control device's softc.
+ */
+struct vtdbg_softc {
+ struct mtx vtd_mtx;
+ struct cv vtd_cv;
+ struct knlist vtd_note;
+ uint32_t vtd_magic;
+
+ vm_object_t vtd_object;
+ vm_ooffset_t vtd_baseaddr;
+ size_t vtd_bytes;
+ size_t vtd_allocated;
+
+ virtqueue_intr_t *vtd_intr;
+ void *vtd_intr_arg;
+ struct proc *vtd_pintr;
+
+ vm_ooffset_t vtd_offset;
+
+ uint32_t vtd_flags;
+
+ device_t vtd_dev;
+};
+
+/*
+ * Subclass of vtmmio_softc that lets the virtio device access
+ * vtdbg-related information while remaining usable from vtmmio_*
+ * methods. The vtdbg_softc * is the softc of the control device and
+ * is allocated dynamically when opening an instance of the control device,
+ * while the virtio_dbg_softc here is allocated during device_t creation.
+ */
+struct virtio_dbg_softc {
+ struct vtmmio_softc vtmdbg_mmio;
+ struct vtdbg_softc *vtmdbg_dbg;
+};
+
+/*
+ * Store the parent bus and driver pointers for the debug devices,
+ * because we need them when creating debug devices on-demand later on.
+ * We are hanging off of the nexus, so we are certain it's not going away.
+ */
+static void
+virtio_dbg_identify(driver_t *driver, device_t parent)
+{
+ vtdbg_parent = parent;
+ vtdbg_driver = driver;
+}
+
+static struct vtdbg_softc *
+vtmmio_get_vtdbg(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ MPASS(sc->vtmdbg_dbg->vtd_magic == VTDBG_MAGIC);
+
+ return (sc->vtmdbg_dbg);
+}
+
+/*
+ * Explicitly turn polling into a no-op.
+ */
+static int
+virtio_dbg_poll(device_t dev)
+{
+
+ return (0);
+}
+
+
+/*
+ * Make sure the shared virtio device region between kernel and userspace
+ * is configured properly.
+ */
+static int
+virtio_dbg_probe(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ uint32_t magic, version;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /* Fake platform to trigger virtio_mmio_note() on writes. */
+ sc->vtmdbg_mmio.platform = dev;
+
+ magic = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_MAGIC_VALUE);
+ if (magic != VIRTIO_MMIO_MAGIC_VIRT) {
+ device_printf(dev, "Bad magic value %#x\n", magic);
+ return (ENXIO);
+ }
+
+ version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+ if (version != 2) {
+ device_printf(dev, "Unsupported version: %#x\n", version);
+ return (ENXIO);
+ }
+
+ if (vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_DEVICE_ID) == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Emulated MMIO adapter");
+
+ return (0);
+}
+
+/*
+ * Creates the virtio device corresponding to the transport instance.
+ */
+static int
+virtio_dbg_attach(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ device_t child;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ mmiosc->dev = dev;
+ mmiosc->vtmmio_version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+
+ vtmmio_reset(mmiosc);
+
+ /* Tell the host we've noticed this device. */
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+ mtx_lock(&Giant);
+ if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+ device_printf(dev, "Cannot create child device.\n");
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ return (ENOMEM);
+ }
+
+ mmiosc->vtmmio_child_dev = child;
+ vtmmio_probe_and_attach_child(mmiosc);
+
+ mtx_unlock(&Giant);
+
+ return (0);
+}
+
+/*
+ * Recompute the queue descriptor to be an offset within the shared user/kernel
+ * device control region. Our userspace cannot meaningfully translate
+ * kernel physical addresses, so we transform the values in the queue
+ * descriptor address registers into offsets. Userspace finds the vq address
+ * by adding the offset to its own virtual address for the region.
+ */
+static void
+virtio_dbg_qdesc_offset(struct vtmmio_softc *sc, uint64_t baseaddr,
+ int hireg, int loreg)
+{
+ struct resource *res = sc->res[0];
+ uint32_t hi, lo;
+ uint64_t qaddr;
+
+ /* Read in the components of the physical address. */
+ hi = bus_read_4(res, hireg);
+ lo = bus_read_4(res, loreg);
+
+ /* Recompute into an offset into the vq control region. */
+ qaddr = (((uint64_t)hi) << 32 | (uint64_t)lo);
+ qaddr -= vtophys(baseaddr);
+
+ /* Update the register values. */
+ hi = (qaddr >> 32);
+ lo = (qaddr & ((1ULL << 32) - 1));
+
+	/* Direct bus write to avoid triggering note(). */
+ bus_write_4(res, hireg, hi);
+ bus_write_4(res, loreg, lo);
+}
+
+/* Notify userspace of a write, and wait for a response. */
+static int
+virtio_dbg_note(device_t dev, size_t offset, int val)
+{
+ struct vtdbg_softc *vtdsc;
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ vtdsc = sc->vtmdbg_dbg;
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ /*
+ * Intercept writes to the QUEUE_{DESC, AVAIL, USED}_{HIGH, LOW}
+ * registers and instead pass to the user the offset from the beginning
+	 * of the control region. Do not actually notify userspace of these writes;
+	 * we recompute and notify once VIRTIO_MMIO_QUEUE_READY is set.
+ *
+ * Both high and low registers are set together, so just track writes to
+ * the high address bits.
+ */
+ switch (offset) {
+ case VIRTIO_MMIO_QUEUE_DESC_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_DESC;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_USED_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_USED;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_AVAIL;
+ return (1);
+ }
+
+ /* Only forward the listed register writes to userspace. */
+ switch (offset) {
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ case VIRTIO_MMIO_QUEUE_SEL:
+ case VIRTIO_MMIO_QUEUE_NUM:
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ case VIRTIO_MMIO_STATUS:
+ break;
+ case VIRTIO_MMIO_QUEUE_READY:
+		/* If the queue addresses changed, transform them into offsets. */
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_DESC) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_DESC_HIGH, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_DESC;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_USED) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_USED_HIGH, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_USED;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_AVAIL) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_AVAIL_HIGH, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_AVAIL;
+ }
+ break;
+ default:
+ return (1);
+ }
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ vtdsc->vtd_offset = offset;
+ KNOTE_LOCKED(&vtdsc->vtd_note, 0);
+
+ /*
+	 * We cannot sleep here because this code is called with non-sleepable
+	 * locks held, so to prevent a "sleeping thread" panic we busy wait.
+	 * The corresponding operation for other transports is a VM exit,
+	 * which is instantaneous from the point of view of the guest kernel.
+	 * There is always the danger of our VMM process leaving us hanging,
+	 * but that danger exists even for non-emulated virtio transports - it
+	 * just isn't visible to the guest, since the VMM is normally on the host.
+ */
+ while (vtdsc->vtd_offset != 0) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ cpu_spinwait();
+ mtx_lock(&vtdsc->vtd_mtx);
+ }
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (1);
+}
+
+/*
+ * Pass interrupt information to the cdev. The cdev will be directly
+ * running the device interrupt handling code as an ioctl.
+ */
+static int
+virtio_dbg_setup_intr(device_t dev, device_t mmio_dev, void *handler, void *ih_user)
+{
+ struct vtdbg_softc *sc;
+
+ sc = vtmmio_get_vtdbg(dev);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_intr = handler;
+ sc->vtd_intr_arg = ih_user;
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (0);
+}
+
+static device_method_t virtio_dbg_methods[] = {
+ DEVMETHOD(device_attach, virtio_dbg_attach),
+ DEVMETHOD(device_identify, virtio_dbg_identify),
+ DEVMETHOD(device_probe, virtio_dbg_probe),
+
+ DEVMETHOD(virtio_mmio_poll, virtio_dbg_poll),
+ DEVMETHOD(virtio_mmio_note, virtio_dbg_note),
+ DEVMETHOD(virtio_mmio_setup_intr, virtio_dbg_setup_intr),
+
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_1(virtio_dbg, virtio_dbg_driver, virtio_dbg_methods,
+	sizeof(struct virtio_dbg_softc), vtmmio_driver);
+/*
+ * XXX We currently hang off of the nexus; not 100% sure that is the right way.
+ */
+DRIVER_MODULE(virtio_dbg, nexus, virtio_dbg_driver, 0, 0);
+MODULE_VERSION(virtio_dbg, 1);
+
+static struct cdev *vtdbg_dev;
+
+/*
+ * Create and map the device memory into the kernel.
+ */
+static int
+vtdbg_map_kernel(struct vtdbg_softc *sc)
+{
+ vm_object_t obj = sc->vtd_object;
+ size_t bytes = IDX_TO_OFF(obj->size);
+ vm_offset_t baseaddr, tmp;
+ vm_page_t m, end_m;
+ int error;
+
+ /* XXX Do not allow mapping twice. */
+
+ vm_object_reference(obj);
+
+ /*
+ * Populate the object with physically contiguous pages, because
+ * the object is used to back the virtio device control region.
+ */
+ VM_OBJECT_WLOCK(obj);
+ m = vm_page_alloc_contig(obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_ZERO, obj->size,
+ 0, (uint64_t) -1, 1, 0, VM_MEMATTR_DEFAULT);
+ VM_OBJECT_WUNLOCK(obj);
+ if (m == NULL) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+
+ baseaddr = VM_MIN_KERNEL_ADDRESS;
+ error = vm_map_find(kernel_map, obj, 0, &baseaddr, bytes, VM_MAX_KERNEL_ADDRESS,
+ VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+ end_m = m + (bytes / PAGE_SIZE);
+ tmp = baseaddr;
+ for (; m < end_m; m++) {
+ vm_page_valid(m);
+ pmap_zero_page(m);
+ pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
+ VM_PROT_RW | PMAP_ENTER_WIRED, 0);
+ tmp += PAGE_SIZE;
+ vm_page_xunbusy(m);
+ }
+
+
+ sc->vtd_baseaddr = baseaddr;
+ sc->vtd_bytes = bytes;
+
+ /* Reserve space for the device control region. */
+ sc->vtd_allocated = VTDBG_RESERVE_DEVSPACE;
+
+ return (0);
+}
+
+static void
+vtdbg_intr(void *arg)
+{
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+
+ mtx_lock(&sc->vtd_mtx);
+ while ((sc->vtd_flags & VTDBG_INTR_EXITING) == 0) {
+ if ((sc->vtd_flags & VTDBG_INTR_PENDING) == 0) {
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ continue;
+ }
+
+ sc->vtd_flags &= ~VTDBG_INTR_PENDING;
+ mtx_unlock(&sc->vtd_mtx);
+
+ if (sc->vtd_intr)
+ sc->vtd_intr(sc->vtd_intr_arg);
+
+ mtx_lock(&sc->vtd_mtx);
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ }
+
+ sc->vtd_pintr = NULL;
+ cv_signal(&sc->vtd_cv);
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ kproc_exit(0);
+}
+
+/*
+ * Destroy the virtio transport instance when closing the
+ * corresponding control device fd.
+ */
+static void
+vtdbg_dtor(void *arg)
+{
+ struct virtio_dbg_softc *devsc;
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+ vm_offset_t sva, eva;
+ device_t dev;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (sc->vtd_pintr != NULL) {
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_EXITING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+
+ mtx_lock(&sc->vtd_mtx);
+ while (sc->vtd_pintr != NULL)
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ mtx_unlock(&sc->vtd_mtx);
+ }
+
+ dev = sc->vtd_dev;
+ if (dev != NULL) {
+ devsc = device_get_softc(dev);
+
+ mtx_lock(&Giant);
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ free(devsc->vtmdbg_mmio.res[0], M_DEVBUF);
+ device_delete_child(vtdbg_parent, dev);
+ }
+
+
+ if (sc->vtd_baseaddr != 0) {
+ sva = sc->vtd_baseaddr;
+ eva = sva + sc->vtd_bytes;
+ vm_map_remove(kernel_map, sva, eva);
+ pmap_remove(kernel_pmap, sva, eva);
+ }
+
+ vm_object_deallocate(sc->vtd_object);
+
+ knlist_delete(&sc->vtd_note, curthread, 0);
+ knlist_destroy(&sc->vtd_note);
+
+ cv_destroy(&sc->vtd_cv);
+ mtx_destroy(&sc->vtd_mtx);
+
+ free(sc, M_DEVBUF);
+}
+
+static int
+vtdbg_open(struct cdev *cdev, int oflags, int devtype, struct thread *td)
+{
+ size_t sz = round_page(VTDBG_MAPSZ);
+ struct vtdbg_softc *sc;
+ int error;
+
+ sc = malloc(sizeof(struct vtdbg_softc), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (sc == NULL)
+ return (ENOMEM);
+
+ sc->vtd_magic = VTDBG_MAGIC;
+ mtx_init(&sc->vtd_mtx, "vtdbg", NULL, MTX_DEF);
+ cv_init(&sc->vtd_cv, "vtdbg");
+
+ knlist_init_mtx(&sc->vtd_note, &sc->vtd_mtx);
+
+ /* Create the common userspace/kernel virtio device region. */
+ sc->vtd_object = vm_pager_allocate(OBJT_PHYS, NULL, sz, VM_PROT_ALL,
+ 0, thread0.td_ucred);
+ if (sc->vtd_object == NULL) {
+ vtdbg_dtor(sc);
+ return (ENOMEM);
+ }
+
+ error = vtdbg_map_kernel(sc);
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = kproc_create(vtdbg_intr, (void *)sc, &sc->vtd_pintr,
+ 0, 0, "vtdbg_intr");
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = devfs_set_cdevpriv((void *)sc, vtdbg_dtor);
+ if (error != 0)
+ vtdbg_dtor(sc);
+
+ return (error);
+}
+
+static int
+vtdbg_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+ vm_size_t size, vm_object_t *objp, int nprot)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+
+ if (*offset + size > sc->vtd_bytes)
+ return (EINVAL);
+
+ vm_object_reference(sc->vtd_object);
+ *objp = sc->vtd_object;
+
+ return (0);
+}
+
+static void *
+vtdbg_ringalloc(device_t dev, size_t size)
+{
+ struct vtdbg_softc *sc = vtmmio_get_vtdbg(dev);
+ void *mem;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ if (sc->vtd_allocated + size > sc->vtd_bytes) {
+ mtx_unlock(&sc->vtd_mtx);
+ return (NULL);
+ }
+
+ mem = (void *)(sc->vtd_baseaddr + sc->vtd_allocated);
+ sc->vtd_allocated += size;
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (mem);
+}
+
+static device_t
+vtdbg_create_transport(device_t parent, struct vtdbg_softc *vtdsc)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ struct resource *res;
+ device_t transport;
+
+ int uid = 0;
+
+ transport = BUS_ADD_CHILD(parent, 0, virtio_dbg_driver.name, uid);
+ device_set_driver(transport, vtdbg_driver);
+
+ sc = device_get_softc(transport);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /*
+	 * XXX Hack. Create the resource out of thin air to
+	 * keep the vtmmio_write_* calls working. To be uniform we
+	 * would reserve the resource from the RAM pseudobus,
+	 * but it has no associated struct rman instance,
+	 * and we have already reserved this memory region
+	 * by allocating it anyway, so there is no possibility
+	 * of conflicts.
+ */
+ res = malloc(sizeof(*res), M_DEVBUF, M_WAITOK);
+ res->r_bushandle = vtdsc->vtd_baseaddr;
+ res->r_bustag = X86_BUS_SPACE_MEM;
+ mmiosc->res[0] = res;
+
+ /* Ring buffer allocation callback. */
+ mmiosc->vtmmio_ringalloc_cb = vtdbg_ringalloc;
+
+ return (transport);
+}
+
+static int
+vtdbg_linkup_transport(struct vtdbg_softc *vtdsc, device_t dev)
+{
+ struct virtio_dbg_softc *mmiosc;
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ if (vtdsc->vtd_dev != NULL) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ return (EALREADY);
+ }
+
+ mmiosc = device_get_softc(dev);
+
+ /* Have the device and cdev be able to refer to each other. */
+ mmiosc->vtmdbg_dbg = vtdsc;
+ vtdsc->vtd_dev = dev;
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (0);
+}
+
+/*
+ * Create virtio device. This function does the initialization both
+ * for the emulated transport, and for the virtio device. These are
+ * normally (e.g., for MMIO) created at boot time using vtmmio_probe/vtmmio_attach,
+ * and vtmmio_probe_and_attach_child, respectively. We do this initialization
+ * here because we are dynamically creating the devices after booting, so
+ * we must manually invoke the device probe and attach methods.
+ */
+static int
+vtdbg_init(void)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtdbg_softc *vtdsc;
+ device_t transport;
+ int error;
+
+ /* Retrieve the mapping address/size. */
+ error = devfs_get_cdevpriv((void **)&vtdsc);
+ if (error != 0)
+ return (error);
+
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ transport = vtdbg_create_transport(vtdbg_parent, vtdsc);
+
+ error = vtdbg_linkup_transport(vtdsc, transport);
+ if (error != 0)
+ goto err;
+
+ error = DEVICE_PROBE(transport);
+ if (error != 0)
+ goto err;
+
+ return (DEVICE_ATTACH(transport));
+
+err:
+ sc = device_get_softc(transport);
+
+ /*
+ * Release the resource but do not notify
+ * the parent bus as we didn't reserve it
+ * from it.
+ */
+ free(sc->vtmdbg_mmio.res[0], M_DEVBUF);
+
+ mtx_lock(&Giant);
+ device_delete_child(vtdbg_parent, transport);
+ mtx_unlock(&Giant);
+
+ vtdsc->vtd_dev = NULL;
+
+ return (error);
+}
+
+/*
+ * Kick the dedicated kernel interrupt process.
+ */
+static void
+vtdbg_kick(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_PENDING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * The mmio virtio code uses note() to let the host know there has been a write.
+ * The note() call suspends the thread until userspace has emulated the
+ * write, at which point a userspace thread allows it to resume.
+ *
+ * There can only be one unacknowledged notification outstanding at a time, so a single
+ * vtd_offset in the softc is enough.
+ */
+static void
+vtdbg_ack(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_offset = 0;
+ wakeup(sc);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * Get virtio data in and out of the kernel, required by userspace to interact with
+ * the data pointed to by the virtqueue descriptors.
+ */
+static int
+vtdbg_io(struct vtdbg_softc *sc, struct vtdbg_io_args *args)
+{
+ struct vtdbg_transfer *tf;
+ caddr_t driver, device;
+ int error = 0;
+ size_t len;
+ int i;
+
+ tf = malloc(args->cnt * sizeof(*tf), M_DEVBUF, M_NOWAIT);
+ if (tf == NULL)
+ return (ENOMEM);
+
+ error = copyin(args->transfers, tf, args->cnt * (sizeof(*tf)));
+ if (error != 0) {
+ free(tf, M_DEVBUF);
+ return (error);
+ }
+
+ for (i = 0; i < args->cnt; i++) {
+		/* Translate from guest physical to kernel virtual. */
+		driver = (caddr_t)PHYS_TO_DMAP((vm_paddr_t)tf[i].vtdt_driver);
+		device = tf[i].vtdt_device;
+ len = tf[i].vtdt_len;
+
+ if (args->touser)
+ error = copyout(driver, device, len);
+ else
+ error = copyin(device, driver, len);
+
+ if (error != 0)
+ break;
+ }
+
+ free(tf, M_DEVBUF);
+
+ return (error);
+}
+
+
+static int
+vtdbg_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct vtdbg_softc *sc;
+ int ret = 0;
+
+ ret = devfs_get_cdevpriv((void **)&sc);
+ if (ret != 0)
+ return (ret);
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ switch (cmd) {
+ case VIRTIO_DBG_INIT:
+ ret = vtdbg_init();
+ break;
+ case VIRTIO_DBG_KICK:
+ vtdbg_kick(sc);
+ break;
+ case VIRTIO_DBG_ACK:
+ vtdbg_ack(sc);
+ break;
+ case VIRTIO_DBG_TRANSFER:
+ ret = vtdbg_io(sc, (struct vtdbg_io_args *)data);
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+vtdbg_filt_attach(struct knote *kn)
+{
+ kn->kn_flags |= EV_CLEAR;
+ return (0);
+}
+
+static void
+vtdbg_filt_detach(struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ knlist_remove(&sc->vtd_note, kn, 0);
+ kn->kn_hook = NULL;
+}
+
+static int
+vtdbg_filt_read(struct knote *kn, long hint)
+{
+ struct vtdbg_softc *sc;
+
+
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ mtx_assert(&sc->vtd_mtx, MA_OWNED);
+
+ if (sc->vtd_offset == 0)
+ return (0);
+
+ kn->kn_data = sc->vtd_offset;
+
+ return (1);
+}
+
+struct filterops vtdbg_filtops = {
+ .f_isfd = 1,
+ .f_attach = vtdbg_filt_attach,
+ .f_detach = vtdbg_filt_detach,
+ .f_event = vtdbg_filt_read,
+};
+
+static int
+vtdbg_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (kn->kn_filter != EVFILT_READ) {
+ kn->kn_data = EINVAL;
+ return (EINVAL);
+ }
+
+ kn->kn_fop = &vtdbg_filtops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->vtd_note, kn, 0);
+
+ return (0);
+
+}
+
+static struct cdevsw vtdbg_cdevsw = {
+ .d_open = vtdbg_open,
+ .d_mmap_single = vtdbg_mmap_single,
+ .d_ioctl = vtdbg_ioctl,
+ .d_kqfilter = vtdbg_kqfilter,
+ .d_name = "vtdbg",
+ .d_version = D_VERSION,
+};
+
+static int
+vtdbg_dev_create(void)
+{
+ vtdbg_dev = make_dev(&vtdbg_cdevsw, 0, UID_ROOT, GID_OPERATOR,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "vtdbg");
+ if (vtdbg_dev == NULL)
+ return (ENOMEM);
+
+ return (0);
+}
+
+static void
+vtdbg_dev_destroy(void)
+{
+ MPASS(vtdbg_dev != NULL);
+ destroy_dev(vtdbg_dev);
+}
+
+static int
+vtdbg_loader(struct module *m, int what, void *arg)
+{
+ int err = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ err = vtdbg_dev_create();
+ break;
+ case MOD_UNLOAD:
+ vtdbg_dev_destroy();
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (err);
+}
+
+static moduledata_t vtdbg_moddata = {
+ "vtdbg",
+ vtdbg_loader,
+ NULL,
+};
+
+DECLARE_MODULE(vtdbg, vtdbg_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE);
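To round out the picture from the emulator side: once a descriptor has been pulled off the available ring in the shared mapping, VIRTIO_DBG_TRANSFER moves the payload it points at between guest driver memory and the emulator. The helper below is a hypothetical sketch, not part of the patch; fd is an open /dev/vtdbg descriptor and desc_paddr a guest-physical address read from a vring descriptor.

/*
 * Hypothetical helper, not part of the patch: copy one descriptor's
 * payload into an emulator-side buffer. vtdt_driver carries the
 * guest-physical address from the descriptor (translated via
 * PHYS_TO_DMAP in vtdbg_io()); vtdt_device is a user buffer.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdbool.h>
#include <stdint.h>

#include "virtio_dbg.h"

static int
fetch_descriptor(int fd, uint64_t desc_paddr, void *buf, size_t len)
{
	struct vtdbg_transfer tf = {
		.vtdt_device = buf,			/* emulator buffer */
		.vtdt_driver = (caddr_t)(uintptr_t)desc_paddr,
		.vtdt_len = len,
	};
	struct vtdbg_io_args args = {
		.transfers = &tf,
		.cnt = 1,
		.touser = true,		/* guest memory -> emulator */
	};

	return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
}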
diff --git a/sys/dev/virtio/mmio/virtio_mmio.h b/sys/dev/virtio/mmio/virtio_mmio.h
--- a/sys/dev/virtio/mmio/virtio_mmio.h
+++ b/sys/dev/virtio/mmio/virtio_mmio.h
@@ -31,9 +31,12 @@
#ifndef _VIRTIO_MMIO_H
#define _VIRTIO_MMIO_H
+#ifdef _KERNEL
+
DECLARE_CLASS(vtmmio_driver);
struct vtmmio_virtqueue;
+typedef void *vtmmio_alloc_cb_t(device_t, size_t);
struct vtmmio_softc {
device_t dev;
@@ -51,10 +54,25 @@
int vtmmio_nvqs;
struct vtmmio_virtqueue *vtmmio_vqs;
void *ih;
+
+ vtmmio_alloc_cb_t *vtmmio_ringalloc_cb;
};
int vtmmio_probe(device_t);
int vtmmio_attach(device_t);
+void vtmmio_reset(struct vtmmio_softc *);
+uint8_t vtmmio_get_status(device_t);
+void vtmmio_set_status(device_t, uint8_t);
+void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
+
+#define vtmmio_read_config_1(sc, o) \
+ bus_read_1((sc)->res[0], (o))
+#define vtmmio_read_config_2(sc, o) \
+ bus_read_2((sc)->res[0], (o))
+#define vtmmio_read_config_4(sc, o) \
+ bus_read_4((sc)->res[0], (o))
+
+#endif /* _KERNEL */
#define VIRTIO_MMIO_MAGIC_VALUE 0x000
#define VIRTIO_MMIO_VERSION 0x004
diff --git a/sys/dev/virtio/mmio/virtio_mmio.c b/sys/dev/virtio/mmio/virtio_mmio.c
--- a/sys/dev/virtio/mmio/virtio_mmio.c
+++ b/sys/dev/virtio/mmio/virtio_mmio.c
@@ -84,19 +84,15 @@
static void vtmmio_reinit_complete(device_t);
static void vtmmio_notify_virtqueue(device_t, uint16_t, bus_size_t);
static int vtmmio_config_generation(device_t);
-static uint8_t vtmmio_get_status(device_t);
-static void vtmmio_set_status(device_t, uint8_t);
static void vtmmio_read_dev_config(device_t, bus_size_t, void *, int);
static uint64_t vtmmio_read_dev_config_8(struct vtmmio_softc *, bus_size_t);
static void vtmmio_write_dev_config(device_t, bus_size_t, const void *, int);
static void vtmmio_describe_features(struct vtmmio_softc *, const char *,
uint64_t);
-static void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
static int vtmmio_reinit_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_free_interrupts(struct vtmmio_softc *);
static void vtmmio_free_virtqueues(struct vtmmio_softc *);
static void vtmmio_release_child_resources(struct vtmmio_softc *);
-static void vtmmio_reset(struct vtmmio_softc *);
static void vtmmio_select_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_vq_intr(void *);
@@ -128,13 +124,6 @@
VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \
} while (0)
-#define vtmmio_read_config_1(sc, o) \
- bus_read_1((sc)->res[0], (o))
-#define vtmmio_read_config_2(sc, o) \
- bus_read_2((sc)->res[0], (o))
-#define vtmmio_read_config_4(sc, o) \
- bus_read_4((sc)->res[0], (o))
-
static device_method_t vtmmio_methods[] = {
/* Device interface. */
DEVMETHOD(device_attach, vtmmio_attach),
@@ -572,7 +561,7 @@
error = virtqueue_alloc(dev, idx, size,
VIRTIO_MMIO_QUEUE_NOTIFY, VIRTIO_MMIO_VRING_ALIGN,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, sc->vtmmio_ringalloc_cb);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n",
@@ -689,7 +678,7 @@
return (gen);
}
-static uint8_t
+uint8_t
vtmmio_get_status(device_t dev)
{
struct vtmmio_softc *sc;
@@ -699,7 +688,7 @@
return (vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS));
}
-static void
+void
vtmmio_set_status(device_t dev, uint8_t status)
{
struct vtmmio_softc *sc;
@@ -875,7 +864,7 @@
virtio_describe(dev, msg, features, sc->vtmmio_child_feat_desc);
}
-static void
+void
vtmmio_probe_and_attach_child(struct vtmmio_softc *sc)
{
device_t dev, child;
@@ -976,7 +965,7 @@
vtmmio_free_virtqueues(sc);
}
-static void
+void
vtmmio_reset(struct vtmmio_softc *sc)
{
diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c
--- a/sys/dev/virtio/pci/virtio_pci.c
+++ b/sys/dev/virtio/pci/virtio_pci.c
@@ -362,7 +362,7 @@
notify_offset = vtpci_get_vq_notify_off(cn, idx);
error = virtqueue_alloc(dev, idx, size, notify_offset, align,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, NULL);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n", idx, error);
diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h
--- a/sys/dev/virtio/virtio.h
+++ b/sys/dev/virtio/virtio.h
@@ -68,7 +68,9 @@
#define VIRTIO_DRIVER_MODULE(name, driver, evh, arg) \
DRIVER_MODULE(name, virtio_mmio, driver, evh, arg); \
- DRIVER_MODULE(name, virtio_pci, driver, evh, arg)
+ DRIVER_MODULE(name, virtio_pci, driver, evh, arg); \
+ DRIVER_MODULE(name, virtio_dbg, driver, evh, arg)
+
struct virtio_pnp_match {
uint32_t device_type;
@@ -82,6 +84,8 @@
MODULE_PNP_INFO("U32:device_type;D:#", virtio_mmio, driver, \
&driver ## _match, 1); \
MODULE_PNP_INFO("U32:device_type;D:#", virtio_pci, driver, \
+	    &driver ## _match, 1);					\
+ MODULE_PNP_INFO("U32:device_type;D:#", virtio_dbg, driver, \
&driver ## _match, 1)
#define VIRTIO_SIMPLE_PROBE(dev, driver) \
(virtio_simple_probe(dev, &driver ## _match))
diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h
--- a/sys/dev/virtio/virtqueue.h
+++ b/sys/dev/virtio/virtqueue.h
@@ -34,6 +34,7 @@
/* Device callback for a virtqueue interrupt. */
typedef void virtqueue_intr_t(void *);
+typedef void *virtqueue_alloc_cb_t(device_t, size_t);
/*
* Hint on how long the next interrupt should be postponed. This is
@@ -67,7 +68,8 @@
int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp);
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t *cb);
void *virtqueue_drain(struct virtqueue *vq, int *last);
void virtqueue_free(struct virtqueue *vq);
int virtqueue_reinit(struct virtqueue *vq, uint16_t size);
diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c
--- a/sys/dev/virtio/virtqueue.c
+++ b/sys/dev/virtio/virtqueue.c
@@ -151,7 +151,8 @@
int
virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp)
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t alloc_cb)
{
struct virtqueue *vq;
int error;
@@ -206,8 +207,12 @@
}
vq->vq_ring_size = round_page(vring_size(size, align));
- vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
- M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ if (alloc_cb != NULL) {
+ vq->vq_ring_mem = alloc_cb(dev, vq->vq_ring_size);
+ } else {
+ vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+ M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ }
if (vq->vq_ring_mem == NULL) {
device_printf(dev,
"cannot allocate memory for virtqueue ring\n");
diff --git a/tests/sys/Makefile b/tests/sys/Makefile
--- a/tests/sys/Makefile
+++ b/tests/sys/Makefile
@@ -33,6 +33,7 @@
TESTS_SUBDIRS+= ses
TESTS_SUBDIRS+= sys
TESTS_SUBDIRS+= vfs
+TESTS_SUBDIRS+= virtio
TESTS_SUBDIRS+= vm
TESTS_SUBDIRS+= vmm
diff --git a/tests/sys/virtio/Makefile b/tests/sys/virtio/Makefile
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/Makefile
@@ -0,0 +1,28 @@
+PROG= virtiodbg
+
+.PATH: ${SRCTOP}/sys/libkern
+
+SRCS= block_if.c \
+ config.c \
+ iov.c \
+ iov_emul.c \
+ mevent.c \
+ mmio_virtio_block.c \
+ mmio_emul.c \
+ virtio.c \
+ virtiodbg.c
+
+MAN=
+
+CFLAGS+=-I${.CURDIR} \
+ -I${SRCTOP}/sys
+
+LIBADD+= md nv pthread
+
+# Disable thread safety analysis since it only finds very simple bugs and
+# yields many false positives.
+NO_WTHREAD_SAFETY=
+
+NO_WCAST_ALIGN=
+
+.include <bsd.prog.mk>
diff --git a/tests/sys/virtio/block_if.h b/tests/sys/virtio/block_if.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.h
@@ -0,0 +1,84 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The block API to be used by bhyve block-device emulations. The routines
+ * are thread safe, with no assumptions about the context of the completion
+ * callback - it may occur in the caller's context, or asynchronously in
+ * another thread.
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+#include <sys/nv.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+/*
+ * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
+ * a single request. BLOCKIF_RING_MAX is the maximum number of
+ * pending requests that can be queued.
+ */
+#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */
+#define BLOCKIF_RING_MAX 128
+
+struct blockif_req {
+ int br_iovcnt;
+ off_t br_offset;
+ ssize_t br_resid;
+ void (*br_callback)(struct blockif_req *req, int err);
+ void *br_param;
+ struct iovec br_iov[BLOCKIF_IOV_MAX];
+};
+
+struct mmio_devinst;
+struct blockif_ctxt;
+
+typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t, uint64_t);
+
+int blockif_legacy_config(nvlist_t *nvl, const char *opts);
+struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident);
+int blockif_register_resize_callback(struct blockif_ctxt *bc,
+ blockif_resize_cb *cb, void *cb_arg);
+off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
+int blockif_sectsz(struct blockif_ctxt *bc);
+void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
+int blockif_queuesz(struct blockif_ctxt *bc);
+int blockif_is_ro(struct blockif_ctxt *bc);
+int blockif_candelete(struct blockif_ctxt *bc);
+int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
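A hypothetical usage sketch of this API, not part of the patch: open a backing image through an nv(9) list carrying the "path" node that blockif_open() consumes, then issue one asynchronous read. The image path, ident string, and sleep-based wait are placeholders.

/*
 * Hypothetical usage sketch, not part of the patch: open a backing
 * file and issue one asynchronous 512-byte read via the blockif API.
 */
#include <sys/nv.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include "block_if.h"

static void
read_done(struct blockif_req *req, int error)
{
	printf("read finished: error=%d resid=%zd\n", error, req->br_resid);
}

int
main(void)
{
	static uint8_t sector[512];
	struct blockif_req req;
	struct blockif_ctxt *bc;
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "path", "/tmp/disk.img"); /* assumed image */

	if ((bc = blockif_open(nvl, "test")) == NULL)
		errx(1, "blockif_open failed");

	memset(&req, 0, sizeof(req));
	req.br_iov[0].iov_base = sector;
	req.br_iov[0].iov_len = sizeof(sector);
	req.br_iovcnt = 1;
	req.br_offset = 0;
	req.br_resid = sizeof(sector);
	req.br_callback = read_done;

	if (blockif_read(bc, &req) != 0)
		errx(1, "blockif_read failed");

	sleep(1);	/* crude stand-in for waiting on the callback */
	return (blockif_close(bc));
}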
diff --git a/tests/sys/virtio/block_if.c b/tests/sys/virtio/block_if.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.c
@@ -0,0 +1,980 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "block_if.h"
+
+#define BLOCKIF_SIG 0xb109b109
+
+#define BLOCKIF_NUMTHR 8
+#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
+
+enum blockop {
+ BOP_READ,
+ BOP_WRITE,
+ BOP_FLUSH,
+ BOP_DELETE
+};
+
+enum blockstat {
+ BST_FREE,
+ BST_BLOCK,
+ BST_PEND,
+ BST_BUSY,
+ BST_DONE
+};
+
+struct blockif_elem {
+ TAILQ_ENTRY(blockif_elem) be_link;
+ struct blockif_req *be_req;
+ enum blockop be_op;
+ enum blockstat be_status;
+ pthread_t be_tid;
+ off_t be_block;
+};
+
+struct blockif_ctxt {
+ unsigned int bc_magic;
+ int bc_fd;
+ int bc_ischr;
+ int bc_isgeom;
+ int bc_candelete;
+ int bc_rdonly;
+ off_t bc_size;
+ int bc_sectsz;
+ int bc_psectsz;
+ int bc_psectoff;
+ int bc_closing;
+ int bc_paused;
+ pthread_t bc_btid[BLOCKIF_NUMTHR];
+ pthread_mutex_t bc_mtx;
+ pthread_cond_t bc_cond;
+ pthread_cond_t bc_work_done_cond;
+ blockif_resize_cb *bc_resize_cb;
+ void *bc_resize_cb_arg;
+ struct mevent *bc_resize_event;
+
+ /* Request elements and free/pending/busy queues */
+ TAILQ_HEAD(, blockif_elem) bc_freeq;
+ TAILQ_HEAD(, blockif_elem) bc_pendq;
+ TAILQ_HEAD(, blockif_elem) bc_busyq;
+ struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
+ int bc_bootindex;
+};
+
+static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
+
+struct blockif_sig_elem {
+ pthread_mutex_t bse_mtx;
+ pthread_cond_t bse_cond;
+ int bse_pending;
+ struct blockif_sig_elem *bse_next;
+};
+
+static struct blockif_sig_elem *blockif_bse_head;
+
+static int
+blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ struct blockif_elem *be, *tbe;
+ off_t off;
+ int i;
+
+ be = TAILQ_FIRST(&bc->bc_freeq);
+ assert(be != NULL);
+ assert(be->be_status == BST_FREE);
+ TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
+ be->be_req = breq;
+ be->be_op = op;
+ switch (op) {
+ case BOP_READ:
+ case BOP_WRITE:
+ case BOP_DELETE:
+ off = breq->br_offset;
+ for (i = 0; i < breq->br_iovcnt; i++)
+ off += breq->br_iov[i].iov_len;
+ break;
+ default:
+ off = OFF_MAX;
+ }
+ be->be_block = off;
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ if (tbe == NULL) {
+ TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ }
+ if (tbe == NULL)
+ be->be_status = BST_PEND;
+ else
+ be->be_status = BST_BLOCK;
+ TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
+ return (be->be_status == BST_PEND);
+}
+
+static int
+blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
+{
+ struct blockif_elem *be;
+
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_status == BST_PEND)
+ break;
+ assert(be->be_status == BST_BLOCK);
+ }
+ if (be == NULL)
+ return (0);
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ be->be_status = BST_BUSY;
+ be->be_tid = t;
+ TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
+ *bep = be;
+ return (1);
+}
+
+static void
+blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+ struct blockif_elem *tbe;
+
+ if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
+ TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
+ else
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_req->br_offset == be->be_block)
+ tbe->be_status = BST_PEND;
+ }
+ be->be_tid = 0;
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+}
+
+static int
+blockif_flush_bc(struct blockif_ctxt *bc)
+{
+ if (bc->bc_ischr) {
+ if (ioctl(bc->bc_fd, DIOCGFLUSH))
+ return (errno);
+ } else if (fsync(bc->bc_fd))
+ return (errno);
+
+ return (0);
+}
+
+static void
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
+{
+ struct spacectl_range range;
+ struct blockif_req *br;
+ off_t arg[2];
+ ssize_t n;
+ size_t clen, len, off, boff, voff;
+ int i, err;
+
+ br = be->be_req;
+ assert(br->br_resid >= 0);
+
+ if (br->br_iovcnt <= 1)
+ buf = NULL;
+ err = 0;
+ switch (be->be_op) {
+ case BOP_READ:
+ if (buf == NULL) {
+ if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ n = pread(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ len = (size_t)n;
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
+ buf + boff, clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+ off += len;
+ br->br_resid -= len;
+ }
+ break;
+ case BOP_WRITE:
+ if (bc->bc_rdonly) {
+ err = EROFS;
+ break;
+ }
+ if (buf == NULL) {
+ if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy(buf + boff,
+ (uint8_t *)br->br_iov[i].iov_base + voff,
+ clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+
+ n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ off += n;
+ br->br_resid -= n;
+ }
+ break;
+ case BOP_FLUSH:
+ err = blockif_flush_bc(bc);
+ break;
+ case BOP_DELETE:
+ if (!bc->bc_candelete)
+ err = EOPNOTSUPP;
+ else if (bc->bc_rdonly)
+ err = EROFS;
+ else if (bc->bc_ischr) {
+ arg[0] = br->br_offset;
+ arg[1] = br->br_resid;
+ if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
+ err = errno;
+ else
+ br->br_resid = 0;
+ } else {
+ range.r_offset = br->br_offset;
+ range.r_len = br->br_resid;
+
+ while (range.r_len > 0) {
+ if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
+ &range, 0, &range) != 0) {
+ err = errno;
+ break;
+ }
+ }
+ if (err == 0)
+ br->br_resid = 0;
+ }
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ be->be_status = BST_DONE;
+
+ (*br->br_callback)(br, err);
+}
+
+static inline bool
+blockif_empty(const struct blockif_ctxt *bc)
+{
+ return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
+}
+
+static void *
+blockif_thr(void *arg)
+{
+ struct blockif_ctxt *bc;
+ struct blockif_elem *be;
+ pthread_t t;
+ uint8_t *buf;
+
+ bc = arg;
+ if (bc->bc_isgeom)
+ buf = malloc(MAXPHYS);
+ else
+ buf = NULL;
+ t = pthread_self();
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ for (;;) {
+ while (blockif_dequeue(bc, t, &be)) {
+ pthread_mutex_unlock(&bc->bc_mtx);
+ blockif_proc(bc, be, buf);
+ pthread_mutex_lock(&bc->bc_mtx);
+ blockif_complete(bc, be);
+ }
+
+		/* If no work remains, notify the main thread */
+ if (blockif_empty(bc))
+ pthread_cond_broadcast(&bc->bc_work_done_cond);
+
+ /* Check ctxt status here to see if exit requested */
+ if (bc->bc_closing)
+ break;
+
+ pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ if (buf)
+ free(buf);
+ pthread_exit(NULL);
+ return (NULL);
+}
+
+static void
+blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
+ void *arg __unused, uint64_t data __unused)
+{
+ struct blockif_sig_elem *bse;
+
+ for (;;) {
+ /*
+ * Process the entire list even if not intended for
+ * this thread.
+ */
+ do {
+ bse = blockif_bse_head;
+ if (bse == NULL)
+ return;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)bse,
+ (uintptr_t)bse->bse_next));
+
+ pthread_mutex_lock(&bse->bse_mtx);
+ bse->bse_pending = 0;
+ pthread_cond_signal(&bse->bse_cond);
+ pthread_mutex_unlock(&bse->bse_mtx);
+ }
+}
+
+static void
+blockif_init(void)
+{
+ mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+ (void) signal(SIGCONT, SIG_IGN);
+}
+
+
+struct blockif_ctxt *
+blockif_open(nvlist_t *nvl, const char *ident)
+{
+ char tname[MAXCOMLEN + 1];
+ char name[MAXPATHLEN];
+ const char *path, *pssval, *ssval, *bootindex_val;
+ char *cp;
+ struct blockif_ctxt *bc;
+ struct stat sbuf;
+ struct diocgattr_arg arg;
+ off_t size, psectsz, psectoff;
+ int extra, fd, i, sectsz;
+ int ro, candelete, geom, ssopt, pssopt;
+ int nodelete;
+ int bootindex;
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+ cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
+#endif
+
+ pthread_once(&blockif_once, blockif_init);
+
+ fd = -1;
+ extra = 0;
+ ssopt = 0;
+ ro = 0;
+ nodelete = 0;
+ bootindex = -1;
+
+ if (get_config_bool_node_default(nvl, "nocache", false))
+ extra |= O_DIRECT;
+ if (get_config_bool_node_default(nvl, "nodelete", false))
+ nodelete = 1;
+ if (get_config_bool_node_default(nvl, "sync", false) ||
+ get_config_bool_node_default(nvl, "direct", false))
+ extra |= O_SYNC;
+ if (get_config_bool_node_default(nvl, "ro", false))
+ ro = 1;
+ ssval = get_config_value_node(nvl, "sectorsize");
+ if (ssval != NULL) {
+ ssopt = strtol(ssval, &cp, 10);
+ if (cp == ssval) {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ if (*cp == '\0') {
+ pssopt = ssopt;
+ } else if (*cp == '/') {
+ pssval = cp + 1;
+ pssopt = strtol(pssval, &cp, 10);
+ if (cp == pssval || *cp != '\0') {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ } else {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ }
+
+ bootindex_val = get_config_value_node(nvl, "bootindex");
+ if (bootindex_val != NULL) {
+ bootindex = atoi(bootindex_val);
+ }
+
+ path = get_config_value_node(nvl, "path");
+ if (path == NULL) {
+ EPRINTLN("Missing \"path\" for block device.");
+ goto err;
+ }
+
+ fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
+ if (fd < 0 && !ro) {
+		/* Retry a failed r/w open as r/o */
+ fd = open(path, O_RDONLY | extra);
+ ro = 1;
+ }
+
+ if (fd < 0) {
+ warn("Could not open backing file: %s", path);
+ goto err;
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ warn("Could not stat backing file %s", path);
+ goto err;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
+ CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
+ if (ro)
+ cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
+
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ psectsz = psectoff = 0;
+ candelete = geom = 0;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ goto err;
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
+ ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
+ strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+ arg.len = sizeof(arg.value.i);
+ if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
+ candelete = arg.value.i;
+ if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+ geom = 1;
+ } else {
+ psectsz = sbuf.st_blksize;
+ /* Avoid fallback implementation */
+ candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ if (ssopt != 0) {
+ if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+ ssopt > pssopt) {
+ EPRINTLN("Invalid sector size %d/%d",
+ ssopt, pssopt);
+ goto err;
+ }
+
+ /*
+ * Some backend drivers (e.g. cd0, ada0) require that the I/O
+ * size be a multiple of the device's sector size.
+ *
+ * Validate that the emulated sector size complies with this
+ * requirement.
+ */
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+ EPRINTLN("Sector size %d incompatible "
+ "with underlying device sector size %d",
+ ssopt, sectsz);
+ goto err;
+ }
+ }
+
+ sectsz = ssopt;
+ psectsz = pssopt;
+ psectoff = 0;
+ }
+
+ bc = calloc(1, sizeof(struct blockif_ctxt));
+ if (bc == NULL) {
+ perror("calloc");
+ goto err;
+ }
+
+ bc->bc_magic = BLOCKIF_SIG;
+ bc->bc_fd = fd;
+ bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+ bc->bc_isgeom = geom;
+ bc->bc_candelete = candelete;
+ bc->bc_rdonly = ro;
+ bc->bc_size = size;
+ bc->bc_sectsz = sectsz;
+ bc->bc_psectsz = psectsz;
+ bc->bc_psectoff = psectoff;
+ pthread_mutex_init(&bc->bc_mtx, NULL);
+ pthread_cond_init(&bc->bc_cond, NULL);
+ bc->bc_paused = 0;
+ pthread_cond_init(&bc->bc_work_done_cond, NULL);
+ TAILQ_INIT(&bc->bc_freeq);
+ TAILQ_INIT(&bc->bc_pendq);
+ TAILQ_INIT(&bc->bc_busyq);
+ bc->bc_bootindex = bootindex;
+ for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+ bc->bc_reqs[i].be_status = BST_FREE;
+ TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+ }
+
+ for (i = 0; i < BLOCKIF_NUMTHR; i++) {
+ pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
+ snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
+ pthread_set_name_np(bc->bc_btid[i], tname);
+ }
+
+ return (bc);
+err:
+ if (fd >= 0)
+ close(fd);
+ return (NULL);
+}
+
+static void
+blockif_resized(int fd, enum ev_type type __unused, void *arg,
+ uint64_t data __unused)
+{
+ struct blockif_ctxt *bc;
+ struct stat sb;
+ off_t mediasize;
+
+ if (fstat(fd, &sb) != 0)
+ return;
+
+ if (S_ISCHR(sb.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
+ EPRINTLN("blockif_resized: get mediasize failed: %s",
+ strerror(errno));
+ return;
+ }
+ } else
+ mediasize = sb.st_size;
+
+ bc = arg;
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (mediasize != bc->bc_size) {
+ bc->bc_size = mediasize;
+ bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size, 0);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+}
+
+int
+blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
+ void *cb_arg)
+{
+ struct stat sb;
+ int err;
+
+ if (cb == NULL)
+ return (EINVAL);
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (bc->bc_resize_cb != NULL) {
+ err = EBUSY;
+ goto out;
+ }
+
+ assert(bc->bc_closing == 0);
+
+ if (fstat(bc->bc_fd, &sb) != 0) {
+ err = errno;
+ goto out;
+ }
+
+ bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
+ EVFF_ATTRIB, blockif_resized, bc);
+ if (bc->bc_resize_event == NULL) {
+ err = ENXIO;
+ goto out;
+ }
+
+ bc->bc_resize_cb = cb;
+ bc->bc_resize_cb_arg = cb_arg;
+out:
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ int err;
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ assert(!bc->bc_paused);
+ if (!TAILQ_EMPTY(&bc->bc_freeq)) {
+ /*
+ * Enqueue and inform the block i/o thread
+ * that there is work available
+ */
+ if (blockif_enqueue(bc, breq, op))
+ pthread_cond_signal(&bc->bc_cond);
+ } else {
+ /*
+ * Callers are not allowed to enqueue more than
+ * the specified blockif queue limit. Return an
+ * error to indicate that the queue length has been
+ * exceeded.
+ */
+ err = E2BIG;
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_READ));
+}
+
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+int
+blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_DELETE));
+}
+
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ /* XXX: not waiting while paused */
+
+ /*
+ * Check pending requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be != NULL) {
+ /*
+ * Found it.
+ */
+ blockif_complete(bc, be);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
+ }
+
+ /*
+ * Check in-flight requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ /*
+ * Didn't find it.
+ */
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ /*
+	 * Interrupt the processing thread to force it to return
+	 * prematurely via its normal callback path.
+ */
+ while (be->be_status == BST_BUSY) {
+ struct blockif_sig_elem bse, *old_head;
+
+ pthread_mutex_init(&bse.bse_mtx, NULL);
+ pthread_cond_init(&bse.bse_cond, NULL);
+
+ bse.bse_pending = 1;
+
+ do {
+ old_head = blockif_bse_head;
+ bse.bse_next = old_head;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)old_head,
+ (uintptr_t)&bse));
+
+ pthread_kill(be->be_tid, SIGCONT);
+
+ pthread_mutex_lock(&bse.bse_mtx);
+ while (bse.bse_pending)
+ pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+ pthread_mutex_unlock(&bse.bse_mtx);
+ }
+
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * The processing thread has been interrupted. Since it's not
+ * clear if the callback has been invoked yet, return EBUSY.
+ */
+ return (EBUSY);
+}
+
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+ void *jval;
+ int i;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ /*
+ * Stop the block i/o thread
+ */
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_closing = 1;
+ if (bc->bc_resize_event != NULL)
+ mevent_disable(bc->bc_resize_event);
+ pthread_mutex_unlock(&bc->bc_mtx);
+ pthread_cond_broadcast(&bc->bc_cond);
+ for (i = 0; i < BLOCKIF_NUMTHR; i++)
+ pthread_join(bc->bc_btid[i], &jval);
+
+ /* XXX Cancel queued i/o's ??? */
+
+ /*
+ * Release resources
+ */
+ bc->bc_magic = 0;
+ close(bc->bc_fd);
+ free(bc);
+
+ return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535L * 16 * 255)
+ sectors = 65535L * 16 * 255;
+
+ if (sectors >= 65536L * 16 * 63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
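+
+/*
+ * Worked example (illustrative, not taken from the VHD spec text itself):
+ * a 1 GiB image with 512-byte sectors has 2097152 sectors. That is below
+ * the 65536*16*63 threshold, so the code starts with secpt=17, giving
+ * hcyl=123361 and heads=121; heads > 16 forces the secpt=31 retry, and
+ * the resulting hcyl=67650 still exceeds heads*1024, so we settle on
+ * secpt=63, heads=16, hcyl=33288, i.e. C/H/S = 2080/16/63 (~2096640
+ * addressable sectors).
+ */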
+
+/*
+ * Accessors
+ */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_size);
+}
+
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_sectsz);
+}
+
+void
+blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ *size = bc->bc_psectsz;
+ *off = bc->bc_psectoff;
+}
+
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (BLOCKIF_MAXREQ - 1);
+}
+
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_rdonly);
+}
+
+int
+blockif_candelete(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_candelete);
+}
diff --git a/tests/sys/virtio/config.h b/tests/sys/virtio/config.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.h
@@ -0,0 +1,129 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#include <sys/nv.h>
+
+/*-
+ * Manages a configuration database backed by an nv(9) list.
+ *
+ * The database only stores string values. Callers should parse
+ * values into other types if needed. String values can reference
+ * other configuration variables using a '%(name)' syntax. In this
+ * case, the name must be the full path of the configuration
+ * variable. The % character can be escaped with a preceding \ to
+ * avoid expansion. Any \ characters must be escaped.
+ *
+ * Configuration variables are stored in a tree. The full path of a
+ * variable is specified as a dot-separated name similar to sysctl(8)
+ * OIDs.
+ */
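+
+/*
+ * Example (illustrative values): after
+ *
+ *	set_config_value("dir", "/tmp");
+ *	set_config_value("disk0.path", "%(dir)/disk.img");
+ *
+ * get_config_value("disk0.path") returns "/tmp/disk.img", while the
+ * escaped form "100\%" expands to the literal string "100%".
+ */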
+
+/*
+ * Fetches the value of a configuration variable. If the "raw" value
+ * contains references to other configuration variables, this function
+ * expands those references and returns a pointer to the parsed
+ * string. The string's storage is only stable until the next call to
+ * this function.
+ *
+ * If no node is found, returns NULL.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+const char *get_config_value_node(const nvlist_t *parent, const char *name);
+
+/*
+ * Similar to get_config_value_node but expects a full path to the
+ * leaf node.
+ */
+const char *get_config_value(const char *path);
+
+/* Initializes the tree to an empty state. */
+void init_config(void);
+
+/*
+ * Creates a configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable. If the node already exists, this returns a pointer to
+ * the existing node.
+ */
+nvlist_t *create_config_node(const char *path);
+
+/*
+ * Looks for an existing configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable.
+ */
+nvlist_t *find_config_node(const char *path);
+
+/*
+ * Similar to the above, but treats the path relative to an existing
+ * 'parent' node rather than as an absolute path.
+ */
+nvlist_t *create_relative_config_node(nvlist_t *parent, const char *path);
+nvlist_t *find_relative_config_node(nvlist_t *parent, const char *path);
+
+/*
+ * Adds or replaces the value of the specified variable.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+void set_config_value_node(nvlist_t *parent, const char *name,
+ const char *value);
+
+/*
+ * Similar to set_config_value_node but only sets the value if it is
+ * not already set.
+ */
+void set_config_value_node_if_unset(nvlist_t *const parent,
+ const char *const name, const char *const value);
+
+/*
+ * Similar to set_config_value_node but expects a full path to the
+ * leaf node.
+ */
+void set_config_value(const char *path, const char *value);
+
+/*
+ * Similar to set_config_value but only sets the value if it is not
+ * already set.
+ */
+void set_config_value_if_unset(const char *const path,
+ const char *const value);
+
+/* Convenience wrappers for boolean variables. */
+bool get_config_bool(const char *path);
+bool get_config_bool_node(const nvlist_t *parent, const char *name);
+bool get_config_bool_default(const char *path, bool def);
+bool get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def);
+void set_config_bool(const char *path, bool value);
+void set_config_bool_node(nvlist_t *parent, const char *name, bool value);
+
+void dump_config(void);
+
+#endif /* !__CONFIG_H__ */
diff --git a/tests/sys/virtio/config.c b/tests/sys/virtio/config.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.c
@@ -0,0 +1,464 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+
+static nvlist_t *config_root;
+
+void
+init_config(void)
+{
+
+ config_root = nvlist_create(0);
+ if (config_root == NULL)
+ err(4, "Failed to create configuration root nvlist");
+}
+
+static nvlist_t *
+_lookup_config_node(nvlist_t *parent, const char *path, bool create)
+{
+ char *copy, *name, *tofree;
+ nvlist_t *nvl, *new_nvl;
+
+ copy = strdup(path);
+ if (copy == NULL)
+ errx(4, "Failed to allocate memory");
+ tofree = copy;
+ nvl = parent;
+	while ((name = strsep(&copy, ".")) != NULL) {
+ if (*name == '\0') {
+ warnx("Invalid configuration node: %s", path);
+ nvl = NULL;
+ break;
+ }
+ if (nvlist_exists_nvlist(nvl, name))
+ /*
+ * XXX-MJ it is incorrect to cast away the const
+ * qualifier like this since the contract with nvlist
+ * says that values are immutable, and some consumers
+ * will indeed add nodes to the returned nvlist. In
+ * practice, however, it appears to be harmless with the
+ * current nvlist implementation, so we just live with
+ * it until the implementation is reworked.
+ */
+ nvl = __DECONST(nvlist_t *,
+ nvlist_get_nvlist(nvl, name));
+ else if (nvlist_exists(nvl, name)) {
+ for (copy = tofree; copy < name; copy++)
+ if (*copy == '\0')
+ *copy = '.';
+ warnx(
+ "Configuration node %s is a child of existing variable %s",
+ path, tofree);
+ nvl = NULL;
+ break;
+ } else if (create) {
+ /*
+ * XXX-MJ as with the case above, "new_nvl" shouldn't be
+ * mutated after its ownership is given to "nvl".
+ */
+ new_nvl = nvlist_create(0);
+ if (new_nvl == NULL)
+ errx(4, "Failed to allocate memory");
+ nvlist_move_nvlist(nvl, name, new_nvl);
+ nvl = new_nvl;
+ } else {
+ nvl = NULL;
+ break;
+ }
+ }
+ free(tofree);
+ return (nvl);
+}
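+
+/*
+ * For example (hypothetical path), _lookup_config_node(config_root,
+ * "virtio-blk.backing", true) walks the nested "virtio-blk" nvlist,
+ * creating it if needed, and returns the "backing" node. strsep() cuts
+ * the path copy up in place, which is why the warning path above
+ * stitches the '.' separators back in before printing.
+ */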
+
+nvlist_t *
+create_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, true));
+}
+
+nvlist_t *
+find_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, false));
+}
+
+nvlist_t *
+create_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, true));
+}
+
+nvlist_t *
+find_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, false));
+}
+
+void
+set_config_value_node(nvlist_t *parent, const char *name, const char *value)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_string(parent, name))
+ nvlist_free_string(parent, name);
+ else if (nvlist_exists(parent, name))
+ errx(4,
+ "Attempting to add value %s to existing node %s of list %p",
+ value, name, parent);
+ nvlist_add_string(parent, name, value);
+}
+
+void
+set_config_value_node_if_unset(nvlist_t *const parent, const char *const name,
+ const char *const value)
+{
+ if (get_config_value_node(parent, name) != NULL) {
+ return;
+ }
+
+ set_config_value_node(parent, name, value);
+}
+
+void
+set_config_value(const char *path, const char *value)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = create_config_node(node_name);
+ if (nvl == NULL)
+ errx(4, "Failed to create configuration node %s",
+ node_name);
+ free(node_name);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_nvlist(nvl, name))
+ errx(4, "Attempting to add value %s to existing node %s",
+ value, path);
+ set_config_value_node(nvl, name, value);
+}
+
+void
+set_config_value_if_unset(const char *const path, const char *const value)
+{
+ if (get_config_value(path) != NULL) {
+ return;
+ }
+
+ set_config_value(path, value);
+}
+
+static const char *
+get_raw_config_value(const char *path)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = find_config_node(node_name);
+ free(node_name);
+ if (nvl == NULL)
+ return (NULL);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_string(nvl, name))
+ return (nvlist_get_string(nvl, name));
+ if (nvlist_exists_nvlist(nvl, name))
+ warnx("Attempting to fetch value of node %s", path);
+ return (NULL);
+}
+
+static char *
+_expand_config_value(const char *value, int depth)
+{
+ FILE *valfp;
+ const char *cp, *vp;
+ char *nestedval, *path, *valbuf;
+ size_t valsize;
+
+ valfp = open_memstream(&valbuf, &valsize);
+ if (valfp == NULL)
+ errx(4, "Failed to allocate memory");
+
+ vp = value;
+ while (*vp != '\0') {
+ switch (*vp) {
+ case '%':
+ if (depth > 15) {
+ warnx(
+ "Too many recursive references in configuration value");
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ if (vp[1] != '(' || vp[2] == '\0')
+ cp = NULL;
+ else
+ cp = strchr(vp + 2, ')');
+ if (cp == NULL) {
+ warnx(
+ "Invalid reference in configuration value \"%s\"",
+ value);
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ vp += 2;
+
+ if (cp == vp) {
+ warnx(
+ "Empty reference in configuration value \"%s\"",
+ value);
+ vp++;
+ break;
+ }
+
+ /* Allocate a C string holding the path. */
+ path = strndup(vp, cp - vp);
+ if (path == NULL)
+ errx(4, "Failed to allocate memory");
+
+ /* Advance 'vp' past the reference. */
+ vp = cp + 1;
+
+ /* Fetch the referenced value. */
+ cp = get_raw_config_value(path);
+ if (cp == NULL)
+ warnx(
+ "Failed to fetch referenced configuration variable %s",
+ path);
+ else {
+ nestedval = _expand_config_value(cp, depth + 1);
+ fputs(nestedval, valfp);
+ free(nestedval);
+ }
+ free(path);
+ break;
+ case '\\':
+ vp++;
+ if (*vp == '\0') {
+ warnx(
+ "Trailing \\ in configuration value \"%s\"",
+ value);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ fputc(*vp, valfp);
+ vp++;
+ break;
+ }
+ }
+ fclose(valfp);
+ return (valbuf);
+}
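+
+/*
+ * Illustrative sketch: with "a" set to "%(b)" and "b" set to "x",
+ * expanding "%(a)" recurses once and yields "x". A reference cycle such
+ * as "a" set to "%(a)" is cut off by the depth check above rather than
+ * recursing forever, at the cost of a warning and a literal '%' in the
+ * output.
+ */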
+
+static const char *
+expand_config_value(const char *value)
+{
+ static char *valbuf;
+
+ if (strchr(value, '%') == NULL)
+ return (value);
+
+ free(valbuf);
+ valbuf = _expand_config_value(value, 0);
+ return (valbuf);
+}
+
+const char *
+get_config_value(const char *path)
+{
+ const char *value;
+
+ value = get_raw_config_value(path);
+ if (value == NULL)
+ return (NULL);
+ return (expand_config_value(value));
+}
+
+const char *
+get_config_value_node(const nvlist_t *parent, const char *name)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_nvlist(parent, name))
+ warnx("Attempt to fetch value of node %s of list %p", name,
+ parent);
+ if (!nvlist_exists_string(parent, name))
+ return (NULL);
+
+ return (expand_config_value(nvlist_get_string(parent, name)));
+}
+
+static bool
+_bool_value(const char *name, const char *value)
+{
+
+ if (strcasecmp(value, "true") == 0 ||
+ strcasecmp(value, "on") == 0 ||
+ strcasecmp(value, "yes") == 0 ||
+ strcmp(value, "1") == 0)
+ return (true);
+ if (strcasecmp(value, "false") == 0 ||
+ strcasecmp(value, "off") == 0 ||
+ strcasecmp(value, "no") == 0 ||
+ strcmp(value, "0") == 0)
+ return (false);
+ err(4, "Invalid value %s for boolean variable %s", value, name);
+}
+
+bool
+get_config_bool(const char *path)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", path);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_default(const char *path, bool def)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_node(const nvlist_t *parent, const char *name)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", name);
+ return (_bool_value(name, value));
+}
+
+bool
+get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(name, value));
+}
+
+void
+set_config_bool(const char *path, bool value)
+{
+
+ set_config_value(path, value ? "true" : "false");
+}
+
+void
+set_config_bool_node(nvlist_t *parent, const char *name, bool value)
+{
+
+ set_config_value_node(parent, name, value ? "true" : "false");
+}
+
+static void
+dump_tree(const char *prefix, const nvlist_t *nvl)
+{
+ const char *name;
+ void *cookie;
+ int type;
+
+ cookie = NULL;
+ while ((name = nvlist_next(nvl, &type, &cookie)) != NULL) {
+ if (type == NV_TYPE_NVLIST) {
+ char *new_prefix;
+
+ asprintf(&new_prefix, "%s%s.", prefix, name);
+ dump_tree(new_prefix, nvlist_get_nvlist(nvl, name));
+ free(new_prefix);
+ } else {
+ assert(type == NV_TYPE_STRING);
+ printf("%s%s=%s\n", prefix, name,
+ nvlist_get_string(nvl, name));
+ }
+ }
+}
+
+void
+dump_config(void)
+{
+ dump_tree("", config_root);
+}
diff --git a/tests/sys/virtio/debug.h b/tests/sys/virtio/debug.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/debug.h
@@ -0,0 +1,40 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEBUG_H_
+#define _DEBUG_H_
+
+#define FPRINTLN(filep, fmt, arg...) \
+ do { \
+ fprintf(filep, fmt "\n", ##arg); \
+ } while (0)
+
+#define PRINTLN(fmt, arg...) FPRINTLN(stdout, fmt, ##arg)
+#define EPRINTLN(fmt, arg...) FPRINTLN(stderr, fmt, ##arg)
+
+#endif
diff --git a/tests/sys/virtio/iov.h b/tests/sys/virtio/iov.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IOV_H_
+#define _IOV_H_
+
+void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
+ int *niov2, size_t seek);
+void truncate_iov(struct iovec *iov, int *niov, size_t length);
+size_t count_iov(const struct iovec *iov, int niov);
+ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+ int niov, size_t seek);
+
+#endif /* _IOV_H_ */
diff --git a/tests/sys/virtio/iov.c b/tests/sys/virtio/iov.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include "iov.h"
+
+void
+seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2,
+ size_t seek)
+{
+ size_t remainder = 0;
+ size_t left = seek;
+ int i, j;
+
+ for (i = 0; i < niov1; i++) {
+ size_t toseek = MIN(left, iov1[i].iov_len);
+ left -= toseek;
+
+ if (toseek == iov1[i].iov_len)
+ continue;
+
+ if (left == 0) {
+ remainder = toseek;
+ break;
+ }
+ }
+
+ for (j = i; j < niov1; j++) {
+ iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder;
+ iov2[j - i].iov_len = iov1[j].iov_len - remainder;
+ remainder = 0;
+ }
+
+ *niov2 = j - i;
+}
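+
+/*
+ * Worked example (illustrative): for iov1 = {4096, 4096} and seek = 6000,
+ * the first entry is consumed whole (left = 1904) and the loop breaks at
+ * i = 1 with remainder = 1904, so iov2 holds a single entry that starts
+ * 1904 bytes into the second buffer with length 2192.
+ */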
+
+size_t
+count_iov(const struct iovec *iov, int niov)
+{
+ size_t total = 0;
+ int i;
+
+ for (i = 0; i < niov; i++)
+ total += iov[i].iov_len;
+
+ return (total);
+}
+
+void
+truncate_iov(struct iovec *iov, int *niov, size_t length)
+{
+ size_t done = 0;
+ int i;
+
+ for (i = 0; i < *niov; i++) {
+ size_t toseek = MIN(length - done, iov[i].iov_len);
+ done += toseek;
+
+		if (toseek < iov[i].iov_len || done == length) {
+ iov[i].iov_len = toseek;
+ *niov = i + 1;
+ return;
+ }
+ }
+}
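+
+/*
+ * Worked example (illustrative): truncating iov = {4096, 4096} to
+ * length = 5000 keeps the first entry intact (it is consumed whole with
+ * the target length not yet reached) and shrinks the second entry to
+ * 904 bytes, leaving *niov = 2.
+ */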
+
+ssize_t
+iov_to_buf(const struct iovec *iov, int niov, void **buf)
+{
+	void *nbuf;
+	size_t ptr, total;
+	int i;
+
+	total = count_iov(iov, niov);
+	/* Don't lose the caller's buffer if realloc() fails. */
+	nbuf = realloc(*buf, total);
+	if (nbuf == NULL)
+		return (-1);
+	*buf = nbuf;
+
+ for (i = 0, ptr = 0; i < niov; i++) {
+ memcpy((uint8_t *)*buf + ptr, iov[i].iov_base, iov[i].iov_len);
+ ptr += iov[i].iov_len;
+ }
+
+ return (total);
+}
+
+ssize_t
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
+ size_t seek)
+{
+ struct iovec *diov;
+ size_t off = 0, len;
+ int i;
+
+ if (seek > 0) {
+ int ndiov;
+
+		diov = malloc(sizeof(struct iovec) * niov);
+		if (diov == NULL)
+			return (-1);
+		seek_iov(iov, niov, diov, &ndiov, seek);
+ iov = diov;
+ niov = ndiov;
+ }
+
+ for (i = 0; i < niov && off < buflen; i++) {
+ len = MIN(iov[i].iov_len, buflen - off);
+ memcpy(iov[i].iov_base, (const uint8_t *)buf + off, len);
+ off += len;
+ }
+
+ if (seek > 0)
+ free(diov);
+
+ return ((ssize_t)off);
+}
+
diff --git a/tests/sys/virtio/iov_emul.h b/tests/sys/virtio/iov_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.h
@@ -0,0 +1,20 @@
+#ifndef _IOV_EMUL_E
+#define _IOV_EMUL_E
+
+struct virtio_softc;
+
+struct iov_emul {
+ struct vtdbg_transfer *iove_tf;
+ size_t iove_maxcnt;
+ size_t iove_ind;
+};
+
+#define IOVE_INIT (16)
+
+struct iov_emul *iove_alloc(void);
+void iove_free(struct iov_emul *iove);
+int iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov);
+int iove_import(int fd, struct iov_emul *iove);
+int iove_export(int fd, struct iov_emul *iove);
+
+#endif /* _IOV_EMUL_E */
diff --git a/tests/sys/virtio/iov_emul.c b/tests/sys/virtio/iov_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.c
@@ -0,0 +1,106 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+struct iov_emul *
+iove_alloc(void)
+{
+ struct iov_emul *iove;
+
+	iove = calloc(1, sizeof(*iove));
+	if (iove == NULL)
+		return (NULL);
+
+ iove->iove_tf = calloc(IOVE_INIT, sizeof(*iove->iove_tf));
+ if (iove->iove_tf == NULL) {
+ free(iove);
+ return (NULL);
+ }
+
+ iove->iove_maxcnt = IOVE_INIT;
+
+ return (iove);
+}
+
+void
+iove_free(struct iov_emul *iove)
+{
+ size_t i;
+
+	for (i = 0; i < iove->iove_ind; i++)
+		free(iove->iove_tf[i].vtdt_device);
+
+	/* Also release the transfer array itself. */
+	free(iove->iove_tf);
+	free(iove);
+}
+
+int
+iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov)
+{
+ struct vtdbg_transfer *tf = iove->iove_tf;
+ size_t ind = iove->iove_ind;
+ char *base;
+
+	if (ind == iove->iove_maxcnt) {
+ tf = reallocarray(tf, 2 * iove->iove_maxcnt,
+ sizeof(*tf));
+ if (tf == NULL)
+ return (ENOMEM);
+ iove->iove_tf = tf;
+ iove->iove_maxcnt *= 2;
+ }
+
+ base = malloc(len);
+ if (base == NULL)
+ return (ENOMEM);
+
+ iove->iove_tf[ind].vtdt_device = base;
+ iove->iove_tf[ind].vtdt_driver = (caddr_t) phys;
+ iove->iove_tf[ind].vtdt_len = len;
+ iove->iove_ind += 1;
+
+ iov->iov_base = base;
+ iov->iov_len = len;
+
+ return (0);
+}
+
+/*
+ * Import a read IO vector from the kernel.
+ */
+int
+iove_import(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = true,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
+
+/*
+ * Export a write IO vector to the kernel.
+ */
+int
+iove_export(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = false,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
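+
+/*
+ * Typical flow (a sketch, assuming the vtdbg control fd is already open):
+ * the device model calls iove_alloc(), then iove_add() once per descriptor
+ * to shadow a guest-physical range with a host buffer, then iove_import()
+ * to pull driver-written data in before processing (or iove_export() to
+ * push device-written data back out afterwards), and finally iove_free().
+ */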
+
diff --git a/tests/sys/virtio/mevent.h b/tests/sys/virtio/mevent.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.h
@@ -0,0 +1,60 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE,
+ EVF_TIMER,
+ EVF_SIGNAL,
+ EVF_VNODE,
+};
+
+/* Filter flags for EVF_VNODE */
+#define EVFF_ATTRIB 0x0001
+
+typedef void mevent_cb_t(int, enum ev_type, void *, uint64_t);
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type, mevent_cb_t *func,
+ void *param);
+struct mevent *mevent_add_flags(int fd, enum ev_type type, int fflags,
+ mevent_cb_t *func, void *param);
+struct mevent *mevent_add_disabled(int fd, enum ev_type type,
+ mevent_cb_t *func, void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+int mevent_timer_update(struct mevent *evp, int msecs);
+
+void mevent_dispatch(void);
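+
+/*
+ * Sketch of typical use (illustrative callback and fd names): register a
+ * persistent read handler, then hand the thread over to the dispatch
+ * loop, which does not return.
+ *
+ *	static void cb(int fd, enum ev_type type, void *param, uint64_t data);
+ *	...
+ *	struct mevent *evp = mevent_add(conn_fd, EVF_READ, cb, NULL);
+ *	assert(evp != NULL);
+ *	mevent_dispatch();
+ */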
+
+#endif /* _MEVENT_H_ */
diff --git a/tests/sys/virtio/mevent.c b/tests/sys/virtio/mevent.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.c
@@ -0,0 +1,564 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, and having events be persistent by default.
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/event.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+static pthread_t mevent_tid;
+static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
+static int mevent_timid = 43;
+static int mevent_pipefd[2];
+static int mfd;
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ mevent_cb_t *me_func;
+#define me_msecs me_fd
+ int me_fd;
+ int me_timid;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state; /* Desired kevent flags. */
+ int me_closefd;
+ int me_fflags;
+ LIST_ENTRY(mevent) me_list;
+};
+
+enum mevent_update_type {
+ UPDATE_ENABLE,
+ UPDATE_DISABLE,
+ UPDATE_TIMER,
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused,
+ uint64_t data __unused)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c = '\0';
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
+
+static void
+mevent_init(void)
+{
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_KQUEUE);
+ if (caph_rights_limit(mfd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ LIST_INIT(&change_head);
+ LIST_INIT(&global_head);
+}
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ if (mevp->me_type == EVF_TIMER)
+ retval = EVFILT_TIMER;
+
+ if (mevp->me_type == EVF_SIGNAL)
+ retval = EVFILT_SIGNAL;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval = EVFILT_VNODE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = mevp->me_state;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval |= EV_CLEAR;
+
+ return (retval);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ switch (mevp->me_type) {
+ case EVF_VNODE:
+ if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
+ retval |= NOTE_ATTRIB;
+ break;
+ case EVF_READ:
+ case EVF_WRITE:
+ case EVF_TIMER:
+ case EVF_SIGNAL:
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+mevent_populate(struct mevent *mevp, struct kevent *kev)
+{
+ if (mevp->me_type == EVF_TIMER) {
+ kev->ident = mevp->me_timid;
+ kev->data = mevp->me_msecs;
+ } else {
+ kev->ident = mevp->me_fd;
+ kev->data = 0;
+ }
+ kev->filter = mevent_kq_filter(mevp);
+ kev->flags = mevent_kq_flags(mevp);
+ kev->fflags = mevent_kq_fflags(mevp);
+ kev->udata = mevp;
+}
+
+static int
+mevent_build(struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ mevent_populate(mevp, &kev[i]);
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state & EV_DELETE) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ uint64_t data;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+ data = kev[i].data;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param, data);
+ }
+}
+
+static struct mevent *
+mevent_add_state(int tfd, enum ev_type type, mevent_cb_t *func, void *param,
+ int state, int fflags)
+{
+ struct kevent kev;
+ struct mevent *lp, *mevp;
+ int ret;
+
+ if (tfd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ pthread_once(&mevent_once, mevent_init);
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry and populate it.
+ */
+ mevp = calloc(1, sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ if (type == EVF_TIMER) {
+ mevp->me_msecs = tfd;
+ mevp->me_timid = mevent_timid++;
+ } else
+ mevp->me_fd = tfd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+ mevp->me_state = state;
+ mevp->me_fflags = fflags;
+
+ /*
+ * Try to add the event. If this fails, report the failure to
+ * the caller.
+ */
+ mevent_populate(mevp, &kev);
+ ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
+ if (ret == -1) {
+ free(mevp);
+ mevp = NULL;
+ goto exit;
+ }
+
+ mevp->me_state &= ~EV_ADD;
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+struct mevent *
+mevent_add(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
+}
+
+struct mevent *
+mevent_add_flags(int tfd, enum ev_type type, int fflags, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
+}
+
+struct mevent *
+mevent_add_disabled(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
+}
+
+static int
+mevent_update(struct mevent *evp, enum mevent_update_type type, int msecs)
+{
+ int newstate;
+
+ mevent_qlock();
+
+ /*
+ * It's not possible to update a deleted event
+ */
+ assert((evp->me_state & EV_DELETE) == 0);
+
+ newstate = evp->me_state;
+ if (type == UPDATE_ENABLE) {
+ newstate |= EV_ENABLE;
+ newstate &= ~EV_DISABLE;
+ } else if (type == UPDATE_DISABLE) {
+ newstate |= EV_DISABLE;
+ newstate &= ~EV_ENABLE;
+ } else {
+ assert(type == UPDATE_TIMER);
+ assert(evp->me_type == EVF_TIMER);
+ newstate |= EV_ADD;
+ evp->me_msecs = msecs;
+ }
+
+ /*
+ * No update needed if enable/disable had no effect
+ */
+ if (evp->me_state != newstate || type == UPDATE_TIMER) {
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not
+ * already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_ENABLE, -1));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_DISABLE, -1));
+}
+
+int
+mevent_timer_update(struct mevent *evp, int msecs)
+{
+ return (mevent_update(evp, UPDATE_TIMER, msecs));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = EV_DELETE;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+
+ pthread_set_name_np(mevent_tid, "mevent");
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int numev;
+ int ret;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ pthread_once(&mevent_once, mevent_init);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+ ret = pipe(mevent_pipefd);
+ if (ret < 0) {
+ perror("pipe");
+ exit(0);
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+	 * Add an internal event handler for the read end of the notify pipe.
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1 && errno != EINTR) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/tests/sys/virtio/mmio_emul.h b/tests/sys/virtio/mmio_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.h
@@ -0,0 +1,117 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MMIO_EMUL_H_
+#define _MMIO_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+#include <sys/nv.h>
+#include <sys/_pthreadtypes.h>
+
+#include <assert.h>
+
+#define MI_NAMESZ (40)
+
+struct mmio_devinst;
+
+struct mmio_devemu {
+ const char *me_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*me_init)(struct mmio_devinst *, nvlist_t *);
+ void (*me_write)(struct mmio_devinst *mdi, uint64_t offset,
+ int size, uint32_t value);
+};
+#define MMIO_EMUL_SET(x) DATA_SET(mmio_devemu_set, x)
+
+enum mmio_devstate {
+ MIDEV_INVALID,
+ MIDEV_ACKNOWLEDGED,
+ MIDEV_DRIVER_FOUND,
+ MIDEV_FEATURES_OK,
+ MIDEV_LIVE,
+ MIDEV_FAILED,
+ MIDEV_DEVICE_STATES,
+};
+
+struct mmio_devinst {
+ struct mmio_devemu *mi_d;
+ char mi_name[MI_NAMESZ];
+ char *mi_addr; /* VQ control region */
+ size_t mi_bytes; /* Size of region in bytes */
+ int mi_fd; /* File descriptor for the region. */
+ enum mmio_devstate mi_state;
+};
+
+/* XXX Sensible default until proven otherwise. But we need to link it with the in-kernel header. */
+#define MMIO_TOTAL_SIZE (1024 * 1024 * 10)
+#define MMIO_CTRDEV ("/dev/vtdbg")
+
+int init_mmio(nvlist_t *nvl);
+void mmio_print_supported_devices(void);
+int mmio_parse_device(nvlist_t *nvl, char *opt);
+
+static __inline void
+mmio_set_cfgdata8(struct mmio_devinst *mdi, int offset, uint8_t val)
+{
+ *(uint8_t *)(mdi->mi_addr + offset) = val;
+}
+
+static __inline void
+mmio_set_cfgdata16(struct mmio_devinst *mdi, int offset, uint16_t val)
+{
+ *(uint16_t *)(mdi->mi_addr + offset) = htole16(val);
+}
+
+static __inline void
+mmio_set_cfgdata32(struct mmio_devinst *mdi, int offset, uint32_t val)
+{
+ *(uint32_t *)(mdi->mi_addr + offset) = htole32(val);
+}
+
+static __inline uint8_t
+mmio_get_cfgdata8(struct mmio_devinst *mdi, int offset)
+{
+ return (*(uint8_t *)(mdi->mi_addr + offset));
+}
+
+static __inline uint16_t
+mmio_get_cfgdata16(struct mmio_devinst *mdi, int offset)
+{
+ return le16toh((*(uint16_t *)(mdi->mi_addr + offset)));
+}
+
+static __inline uint32_t
+mmio_get_cfgdata32(struct mmio_devinst *mdi, int offset)
+{
+ return le32toh((*(uint32_t *)(mdi->mi_addr + offset)));
+}
+
+#endif /* _MMIO_EMUL_H_ */
diff --git a/tests/sys/virtio/mmio_emul.c b/tests/sys/virtio/mmio_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.c
@@ -0,0 +1,178 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/nv.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+SET_DECLARE(mmio_devemu_set, struct mmio_devemu);
+
+static struct mmio_devemu *
+mmio_emul_finddev(const char *name)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ if (!strcmp(mdp->me_emu, name)) {
+ return (mdp);
+ }
+ }
+
+ return (NULL);
+}
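+
+/*
+ * Device models register themselves at link time through the linker set,
+ * e.g. (sketch with a hypothetical emulation name):
+ *
+ *	static struct mmio_devemu mmio_de_foo = {
+ *		.me_emu = "foo",
+ *		.me_init = mmio_foo_init,
+ *		.me_write = mmio_foo_write,
+ *	};
+ *	MMIO_EMUL_SET(mmio_de_foo);
+ *
+ * which is what mmio_emul_finddev() iterates over above.
+ */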
+
+static void *
+mmio_emul_driver_init(void *arg)
+{
+ int error;
+ int fd = (int)(long)arg;
+
+ error = ioctl(fd, VIRTIO_DBG_INIT);
+ if (error < 0) {
+ EPRINTLN("Control device initialization error: %s",
+ strerror(errno));
+ exit(1);
+ }
+ pthread_exit(NULL);
+}
+
+static int
+mmio_emul_control_init(struct mmio_devinst *mdi, struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ pthread_t thread;
+ char *mmio;
+ int err;
+ int fd;
+
+ fd = open(MMIO_CTRDEV, O_RDWR);
+ if (fd == -1) {
+ EPRINTLN("Control device open error: %s",
+ strerror(errno));
+ return (-1);
+ }
+
+ mmio = mmap(NULL, MMIO_TOTAL_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FILE | MAP_SHARED, fd, 0);
+ if (mmio == MAP_FAILED) {
+ EPRINTLN("Control device mapping error: %s",
+ strerror(errno));
+ close(fd);
+ return (-1);
+ }
+
+ mdi->mi_fd = fd;
+ mdi->mi_addr = mmio;
+ mdi->mi_bytes = MMIO_TOTAL_SIZE;
+
+ /*
+ * XXX Hack. We currently hardwire the block device ID. Propagate
+ * the device type in a different way.
+ */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_VIRT);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VERSION, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_DEVICE_ID, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR);
+
+ err = (mde->me_init)(mdi, nvl);
+ if (err != 0)
+ return (err);
+
+	/*
+	 * Issue the ioctl out of band, because we will use this thread to
+	 * service the register writes triggered by the driver during device
+	 * attach.
+	 */
+ return (pthread_create(&thread, NULL, mmio_emul_driver_init, (void *)(long)fd));
+}
+
+static int
+mmio_emul_init(struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ struct mmio_devinst *mdi;
+ int err;
+
+ mdi = calloc(1, sizeof(struct mmio_devinst));
+ if (mdi == NULL)
+ return (ENOMEM);
+
+ snprintf(mdi->mi_name, sizeof(mdi->mi_name), "%s@mmio", mde->me_emu);
+ mdi->mi_state = MIDEV_INVALID;
+ mdi->mi_fd = -1;
+
+ err = mmio_emul_control_init(mdi, mde, nvl);
+ if (err != 0) {
+ free(mdi);
+ return (err);
+ }
+
+ return (0);
+}
+
+int
+mmio_parse_device(nvlist_t *nvl, char *opt)
+{
+ struct mmio_devemu *mde;
+ char *emul = opt;
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+		EPRINTLN("unknown mmio device %s", emul);
+ return (EINVAL);
+ }
+
+ if (get_config_value_node(nvl, "devtype") != NULL) {
+ EPRINTLN("device type already defined!");
+ return (EINVAL);
+ }
+
+ set_config_value_node(nvl, "devtype", mde->me_emu);
+
+ return (0);
+}
+
+
+void
+mmio_print_supported_devices(void)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ printf("%s\n", mdp->me_emu);
+ }
+}
+
+int
+init_mmio(nvlist_t *nvl)
+{
+ struct mmio_devemu *mde;
+ const char *emul;
+
+ emul = get_config_value_node(nvl, "devtype");
+ if (emul == NULL) {
+ EPRINTLN("mmio device missing devtype value");
+ return (EINVAL);
+ }
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+		EPRINTLN("unknown mmio device \"%s\"", emul);
+ return (EINVAL);
+ }
+
+ return (mmio_emul_init(mde, nvl));
+}
diff --git a/tests/sys/virtio/mmio_virtio_block.c b/tests/sys/virtio/mmio_virtio_block.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_virtio_block.c
@@ -0,0 +1,560 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ * Copyright 2020-2021 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <stdbool.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <md5.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+#include "block_if.h"
+#include "iov_emul.h"
+
+#define VTBLK_BSIZE 512
+#define VTBLK_RINGSZ 128
+
+_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+#define VTBLK_S_UNSUPP 2
+
+#define VTBLK_BLK_ID_BYTES 20 + 1
+
+/* Capability bits */
+#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */
+#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */
+#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */
+#define VTBLK_F_RO (1 << 5) /* Disk is read-only */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/
+#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */
+#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */
+#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */
+#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */
+#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */
+#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */
+#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */
+#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( VTBLK_F_SEG_MAX | \
+ VTBLK_F_BLK_SIZE | \
+ VTBLK_F_FLUSH | \
+ VTBLK_F_TOPOLOGY )
+ /* XXX Reactivate */
+// VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
+
+/*
+ * The current blockif_delete() interface only allows a single delete
+ * request at a time.
+ */
+#define VTBLK_MAX_DISCARD_SEG 1
+
+/*
+ * An arbitrary limit to prevent excessive latency due to large
+ * delete requests.
+ */
+#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */
+
+/*
+ * Config space "registers"
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ struct {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } vbc_geometry;
+ uint32_t vbc_blk_size;
+ struct {
+ uint8_t physical_block_exp;
+ uint8_t alignment_offset;
+ uint16_t min_io_size;
+ uint32_t opt_io_size;
+ } vbc_topology;
+ uint8_t vbc_writeback;
+ uint8_t unused0[1];
+ uint16_t num_queues;
+ uint32_t max_discard_sectors;
+ uint32_t max_discard_seg;
+ uint32_t discard_sector_alignment;
+ uint32_t max_write_zeroes_sectors;
+ uint32_t max_write_zeroes_seg;
+ uint8_t write_zeroes_may_unmap;
+ uint8_t unused1[3];
+} __packed;
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+#define VBH_OP_SCSI_CMD 2
+#define VBH_OP_SCSI_CMD_OUT 3
+#define VBH_OP_FLUSH 4
+#define VBH_OP_FLUSH_OUT 5
+#define VBH_OP_IDENT 8
+#define VBH_OP_DISCARD 11
+#define VBH_OP_WRITE_ZEROES 13
+
+#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
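+
+/*
+ * A request chain as seen by the device is therefore laid out as (see
+ * the asserts in mmio_vtblk_proc() below):
+ *
+ *	[ virtio_blk_hdr ][ data iov 0 ] ... [ data iov n ][ 1-byte status ]
+ *
+ * with the header read-only and the status byte write-only.
+ */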
+
+/*
+ * Debug printf
+ */
+static int mmio_vtblk_debug;
+#define DPRINTF(params) if (mmio_vtblk_debug) PRINTLN params
+#define WPRINTF(params) PRINTLN params
+
+struct mmio_vtblk_ioreq {
+ struct blockif_req io_req;
+ struct mmio_vtblk_softc *io_sc;
+ uint8_t *io_status;
+ uint16_t io_idx;
+ struct iov_emul *io_iove;
+};
+
+struct virtio_blk_discard_write_zeroes {
+ uint64_t sector;
+ uint32_t num_sectors;
+ struct {
+ uint32_t unmap:1;
+ uint32_t reserved:31;
+ } flags;
+};
+
+/*
+ * Per-device softc
+ */
+struct mmio_vtblk_softc {
+ struct virtio_softc vbsc_vs;
+ pthread_mutex_t vsc_mtx;
+ struct vqueue_info vbsc_vq;
+ struct vtblk_config *vbsc_cfg;
+ struct virtio_consts vbsc_consts;
+ struct blockif_ctxt *bc;
+ char vbsc_ident[VTBLK_BLK_ID_BYTES];
+ struct mmio_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
+};
+
+static void mmio_vtblk_reset(void *);
+static void mmio_vtblk_notify(void *, struct vqueue_info *);
+static int mmio_vtblk_cfgread(void *, int, int, uint32_t *);
+static int mmio_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+ .vc_name = "vtblk",
+ .vc_nvq = 1,
+ .vc_cfgsize = sizeof(struct vtblk_config),
+ .vc_reset = mmio_vtblk_reset,
+ .vc_qnotify = mmio_vtblk_notify,
+ .vc_cfgread = mmio_vtblk_cfgread,
+ .vc_cfgwrite = mmio_vtblk_cfgwrite,
+ .vc_apply_features = NULL,
+ .vc_hv_caps = VTBLK_S_HOSTCAPS,
+};
+
+static void
+mmio_vtblk_reset(void *vsc)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device reset requested !"));
+ vi_reset_dev(&sc->vbsc_vs);
+}
+
+static void
+mmio_vtblk_done_locked(struct mmio_vtblk_ioreq *io, int err)
+{
+ struct mmio_vtblk_softc *sc = io->io_sc;
+ int fd = sc->vbsc_vs.vs_mi->mi_fd;
+
+ /* convert errno into a virtio block error return */
+ if (err == EOPNOTSUPP || err == ENOSYS)
+ *io->io_status = VTBLK_S_UNSUPP;
+ else if (err != 0)
+ *io->io_status = VTBLK_S_IOERR;
+ else
+ *io->io_status = VTBLK_S_OK;
+
+ iove_export(fd, io->io_iove);
+ iove_free(io->io_iove);
+ io->io_iove = NULL;
+
+ /*
+ * Return the descriptor back to the host.
+ * We wrote 1 byte (our status) to host.
+ */
+ vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
+ vq_endchains(&sc->vbsc_vq, 0);
+}
+
+static void
+mmio_vtblk_done(struct blockif_req *br, int err)
+{
+ struct mmio_vtblk_ioreq *io = br->br_param;
+ struct mmio_vtblk_softc *sc = io->io_sc;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ mmio_vtblk_done_locked(io, err);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+mmio_vtblk_proc(struct mmio_vtblk_softc *sc, struct vqueue_info *vq)
+{
+ struct virtio_blk_hdr *vbh;
+ struct mmio_vtblk_ioreq *io;
+ int i, n;
+ int err;
+ ssize_t iolen;
+ int writeop, type;
+ struct vi_req req;
+ struct iovec iov[BLOCKIF_IOV_MAX + 2];
+ struct virtio_blk_discard_write_zeroes *discard;
+
+ n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req);
+
+ /*
+ * The first descriptor will be the read-only fixed header,
+ * and the last is for status (hence +2 above and below).
+ * The remaining iov's are the actual data I/O vectors.
+ *
+ * XXX - note - this fails on crash dump, which does a
+ * VIRTIO_BLK_T_FLUSH with a zero transfer length
+ */
+ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
+
+ io = &sc->vbsc_ios[req.idx];
+ assert(req.readable != 0);
+ assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+ vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
+ memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
+ io->io_req.br_iovcnt = n - 2;
+ io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
+ io->io_status = (uint8_t *)iov[--n].iov_base;
+ io->io_iove = req.iove;
+ assert(req.writable != 0);
+ assert(iov[n].iov_len == 1);
+
+ /*
+ * XXX
+ * The guest should not be setting the BARRIER flag because
+ * we don't advertise the capability.
+ */
+ type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+ writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
+ /*
+ * - Write op implies read-only descriptor
+ * - Read/ident op implies write-only descriptor
+ *
+ * By taking away either the read-only fixed header or the write-only
+ * status iovec, the following condition should hold true.
+ */
+ assert(n == (writeop ? req.readable : req.writable));
+
+ iolen = 0;
+ for (i = 1; i < n; i++) {
+ iolen += iov[i].iov_len;
+ }
+ io->io_req.br_resid = iolen;
+
+ DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
+ writeop ? "write/discard" : "read/ident", iolen, i - 1,
+ io->io_req.br_offset));
+
+ switch (type) {
+ case VBH_OP_READ:
+ err = blockif_read(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_WRITE:
+ err = blockif_write(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_DISCARD:
+ /*
+ * We currently only support a single discard segment per
+ * request; if the guest has submitted a request that doesn't
+ * conform to this requirement, we return an error.
+ */
+ if (iov[1].iov_len != sizeof (*discard)) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ /* The segments to discard are provided rather than data */
+ discard = (struct virtio_blk_discard_write_zeroes *)
+ iov[1].iov_base;
+
+ /*
+ * virtio v1.1 5.2.6.2:
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
+ * for discard and write zeroes commands if any unknown flag is
+ * set. Furthermore, the device MUST set the status byte to
+ * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
+ * is set.
+ *
+ * Currently there are no known flags for a DISCARD request.
+ */
+ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
+ mmio_vtblk_done_locked(io, ENOTSUP);
+ return;
+ }
+
+ /* Make sure the request doesn't exceed our size limit */
+ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
+ io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
+ err = blockif_delete(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_FLUSH:
+ case VBH_OP_FLUSH_OUT:
+ err = blockif_flush(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_IDENT:
+ /* Assume a single buffer. */
+ /* A serial number that fills the buffer is not zero-terminated. */
+ memset(iov[1].iov_base, 0, iov[1].iov_len);
+ strncpy(iov[1].iov_base, sc->vbsc_ident,
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ mmio_vtblk_done_locked(io, 0);
+ return;
+ default:
+ mmio_vtblk_done_locked(io, EOPNOTSUPP);
+ return;
+ }
+ assert(err == 0);
+}
+
+static void
+mmio_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ while (vq_has_descs(vq))
+ mmio_vtblk_proc(sc, vq);
+}
+
+static void
+mmio_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg,
+ size_t new_size, uint64_t data __unused)
+{
+ struct mmio_vtblk_softc *sc;
+
+ sc = arg;
+
+ sc->vbsc_cfg->vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */
+ /* XXX Handle resizing. */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+static void
+mmio_vtblk_event(int fd, enum ev_type type, void *arg, uint64_t offset)
+{
+ struct mmio_vtblk_softc *sc = (struct mmio_vtblk_softc *)arg;
+ struct mmio_devinst *mdi = sc->vbsc_vs.vs_mi;
+
+ assert(fd == mdi->mi_fd);
+ assert(type == EVF_READ);
+
+ vi_mmio_write(&sc->vbsc_vs, offset);
+
+ /* Let in-progress operations continue. */
+ ioctl(mdi->mi_fd, VIRTIO_DBG_ACK);
+}
+
+static int
+mmio_vtblk_init(struct mmio_devinst *mdi, nvlist_t *nvl)
+{
+ char bident[MI_NAMESZ];
+ struct blockif_ctxt *bctxt;
+ const char *path, *serial;
+ MD5_CTX mdctx;
+ u_char digest[16];
+ struct mmio_vtblk_softc *sc;
+ off_t size;
+ int i, sectsz, sts, sto;
+
+ /*
+ * The supplied backing file has to exist
+ */
+ /* Make sure the name fits */
+ snprintf(bident, sizeof(bident), "%s", mdi->mi_name);
+ bctxt = blockif_open(nvl, bident);
+ if (bctxt == NULL) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ size = blockif_size(bctxt);
+ sectsz = blockif_sectsz(bctxt);
+ blockif_psectsz(bctxt, &sts, &sto);
+
+ sc = calloc(1, sizeof(struct mmio_vtblk_softc));
+ sc->vbsc_cfg = (struct vtblk_config *)((uintptr_t)mdi->mi_addr + VIRTIO_MMIO_CONFIG);
+
+ sc->bc = bctxt;
+ for (i = 0; i < VTBLK_RINGSZ; i++) {
+ struct mmio_vtblk_ioreq *io = &sc->vbsc_ios[i];
+ io->io_req.br_callback = mmio_vtblk_done;
+ io->io_req.br_param = io;
+ io->io_sc = sc;
+ io->io_idx = i;
+ }
+
+ bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
+ if (blockif_candelete(sc->bc))
+ sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /* init virtio softc and virtqueues */
+ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, mdi, &sc->vbsc_vq);
+ sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+ sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+ /* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+ /*
+ * If an explicit identifier is not given, create an
+ * identifier using parts of the md5 sum of the filename.
+ */
+ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES);
+ if ((serial = get_config_value_node(nvl, "serial")) != NULL ||
+ (serial = get_config_value_node(nvl, "ser")) != NULL) {
+ strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES);
+ } else {
+ path = get_config_value_node(nvl, "path");
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, path, strlen(path));
+ MD5Final(digest, &mdctx);
+ snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
+ "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4],
+ digest[5]);
+ }
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg->vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
+ sc->vbsc_cfg->vbc_size_max = 0; /* not negotiated */
+
+ /*
+ * If Linux is presented with a seg_max greater than the virtio queue
+ * size, it can stumble into situations where it violates its own
+ * invariants and panics. For safety, we keep seg_max clamped, paying
+ * heed to the two extra descriptors needed for the header and status
+ * of a request.
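+ * E.g., a 128-entry ring leaves room for at most 126 data
+ * segments per request.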
+ */
+ sc->vbsc_cfg->vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
+ sc->vbsc_cfg->vbc_geometry.cylinders = 0; /* no geometry */
+ sc->vbsc_cfg->vbc_geometry.heads = 0;
+ sc->vbsc_cfg->vbc_geometry.sectors = 0;
+ sc->vbsc_cfg->vbc_blk_size = sectsz;
+ sc->vbsc_cfg->vbc_topology.physical_block_exp =
+ (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
+ sc->vbsc_cfg->vbc_topology.alignment_offset =
+ (sto != 0) ? ((sts - sto) / sectsz) : 0;
+ sc->vbsc_cfg->vbc_topology.min_io_size = 0;
+ sc->vbsc_cfg->vbc_topology.opt_io_size = 0;
+ sc->vbsc_cfg->vbc_writeback = 0;
+ sc->vbsc_cfg->max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
+ sc->vbsc_cfg->max_discard_seg = VTBLK_MAX_DISCARD_SEG;
+ sc->vbsc_cfg->discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE;
+
+ mevent_add(mdi->mi_fd, EVF_READ, mmio_vtblk_event, sc);
+ blockif_register_resize_callback(sc->bc, mmio_vtblk_resized, sc);
+
+ return (0);
+}
+
+static int
+mmio_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused,
+ uint32_t value __unused)
+{
+
+ DPRINTF(("vtblk: write to readonly reg %d", offset));
+ return (1);
+}
+
+static int
+mmio_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+ void *ptr;
+
+ /* our caller has already verified offset and size */
+ ptr = (uint8_t *)sc->vbsc_cfg + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+static const struct mmio_devemu mmio_de_vblk = {
+ .me_emu = "virtio-blk",
+ .me_init = mmio_vtblk_init,
+};
+MMIO_EMUL_SET(mmio_de_vblk);
diff --git a/tests/sys/virtio/virtio.h b/tests/sys/virtio/virtio.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.h
@@ -0,0 +1,323 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _BHYVE_VIRTIO_H_
+#define _BHYVE_VIRTIO_H_
+
+#include <machine/atomic.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtio_ring.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ * https://github.com/rustyrussell/virtio-spec
+ * http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ * +-----------------------------------------------+
+ * | "desc": <N> descriptors, 16 bytes each |
+ * | ----------------------------------------- |
+ * | "avail": 2 uint16; <N> uint16; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ * | "used": 2 x uint16; <N> elems; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set :
+ * NEXT descriptor is chained, so use its "next" field
+ * WRITE descriptor is for host to write into guest RAM
+ * (else host is to read from guest RAM)
+ * INDIRECT descriptor address field is (guest physical)
+ * address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <N> <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN 4096
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_CONSOLE 0x1003
+#define VIRTIO_DEV_SCSI 0x1004
+#define VIRTIO_DEV_RANDOM 0x1005
+#define VIRTIO_DEV_9P 0x1009
+#define VIRTIO_DEV_INPUT 0x1052
+
+/*
+ * PCI revision IDs
+ */
+#define VIRTIO_REV_INPUT 1
+
+/*
+ * PCI subvendor IDs
+ */
+#define VIRTIO_SUBVEN_INPUT 0x108E
+
+/*
+ * PCI subdevice IDs
+ */
+#define VIRTIO_SUBDEV_INPUT 0x1100
+
+/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
+static inline int
+vring_size_aligned(u_int qsz)
+{
+ return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN));
+}
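+
+/*
+ * Worked example (illustrative): for a 64-entry queue the three ring
+ * parts occupy
+ *
+ *     desc:  64 * 16          = 1024 bytes
+ *     avail: 2*2 + 64*2 + 2   =  134 bytes (desc + avail pad to 4096)
+ *     used:  2*2 + 64*8 + 2   =  518 bytes (pads to 4096)
+ *
+ * so vring_size_aligned(64) evaluates to 8192, i.e., two pages.
+ */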
+
+struct mmio_devinst;
+struct vqueue_info;
+
+struct virtio_softc {
+ struct virtio_consts *vs_vc; /* constants (see below) */
+ int vs_flags; /* VIRTIO_* flags from above */
+ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
+ struct mmio_devinst *vs_mi; /* MMIO device instance */
+ uint32_t vs_negotiated_caps; /* negotiated capabilities */
+ struct vqueue_info *vs_queues; /* one per vc_nvq */
+ int vs_curq; /* current queue */
+};
+
+#define VS_LOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_lock(vs->vs_mtx); \
+} while (0)
+
+#define VS_UNLOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_unlock(vs->vs_mtx); \
+} while (0)
+
+struct virtio_consts {
+ const char *vc_name; /* name of driver (for diagnostics) */
+ int vc_nvq; /* number of virtual queues */
+ size_t vc_cfgsize; /* size of dev-specific config regs */
+ void (*vc_reset)(void *); /* called on virtual device reset */
+ void (*vc_qnotify)(void *, struct vqueue_info *);
+ /* called on QNOTIFY if no VQ notify */
+ int (*vc_cfgread)(void *, int, int, uint32_t *);
+ /* called to read config regs */
+ int (*vc_cfgwrite)(void *, int, int, uint32_t);
+ /* called to write config regs */
+ void (*vc_apply_features)(void *, uint64_t);
+ /* called to apply negotiated features */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset. When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one. The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define VQ_ALLOC 0x01 /* set once we have a pfn */
+#define VQ_BROKED 0x02 /* ??? */
+struct vqueue_info {
+ uint16_t vq_qsize; /* size of this queue (a power of 2) */
+ void (*vq_notify)(void *, struct vqueue_info *);
+ /* called instead of vc_notify, if not NULL */
+
+ struct virtio_softc *vq_vs; /* backpointer to softc */
+ uint16_t vq_num; /* we're the num'th queue in the softc */
+
+ uint16_t vq_flags; /* flags (see above) */
+ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */
+ uint16_t vq_next_used; /* index of the next used slot to be filled */
+ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */
+
+ uint32_t vq_offset; /* Offset in the control region */
+
+ struct vring_desc *vq_desc; /* descriptor array */
+ struct vring_avail *vq_avail; /* the "avail" ring */
+ struct vring_used *vq_used; /* the "used" ring */
+};
+
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+ (*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+ ((vq)->vq_avail->ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+ return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors? (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+ return (vq_ring_ready(vq) && vq->vq_last_avail !=
+ vq->vq_avail->idx);
+}
+
+static inline void
+vq_kick_enable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ /*
+ * Full memory barrier to make sure the store to vq_used->flags
+ * happens before the load from vq_avail->idx, which results from a
+ * subsequent call to vq_has_descs().
+ */
+ atomic_thread_fence_seq_cst();
+}
+
+static inline void
+vq_kick_disable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY;
+}
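+
+/*
+ * Illustrative processing-loop sketch (not part of this change;
+ * process_one_chain() is a hypothetical handler): kicks are disabled
+ * while draining the ring, then re-enabled and the ring re-checked to
+ * close the race with a driver that queued right before the enable:
+ *
+ *     for (;;) {
+ *             vq_kick_disable(vq);
+ *             while (vq_has_descs(vq))
+ *                     process_one_chain(vq);
+ *             vq_kick_enable(vq);
+ *             if (!vq_has_descs(vq))
+ *                     break;
+ *     }
+ */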
+
+struct iovec;
+
+/*
+ * Request description returned by vq_getchain.
+ *
+ * Writable iovecs start at iov[req.readable].
+ */
+struct vi_req {
+ int readable; /* num of readable iovecs */
+ int writable; /* num of writable iovecs */
+ unsigned int idx; /* ring index */
+ struct iov_emul *iove; /* I/O vector state exported to the kernel */
+};
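+
+/*
+ * Example (sketch, not part of this change; consume(), produce() and
+ * bytes_written are hypothetical): after a successful vq_getchain()
+ * the device-readable iovecs come first, then the device-writable
+ * ones, and the chain is returned with the number of bytes produced:
+ *
+ *     n = vq_getchain(vq, iov, nitems(iov), &req);
+ *     for (i = 0; i < req.readable; i++)
+ *             consume(&iov[i]);
+ *     for (; i < n; i++)
+ *             produce(&iov[i]);
+ *     vq_relchain(vq, req.idx, bytes_written);
+ */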
+
+void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mi,
+ struct vqueue_info *queues);
+int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void vi_reset_dev(struct virtio_softc *);
+
+int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp);
+void vq_retchains(struct vqueue_info *vq, uint16_t n_chains);
+void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx,
+ uint32_t iolen);
+void vq_relchain_publish(struct vqueue_info *vq);
+void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
+void vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+void vi_mmio_write(struct virtio_softc *vs, uint64_t offset);
+#endif /* _BHYVE_VIRTIO_H_ */
diff --git a/tests/sys/virtio/virtio.c b/tests/sys/virtio/virtio.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.c
@@ -0,0 +1,886 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ * Copyright (c) 2019 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * In case we decide to relax the "virtio softc comes at the
+ * front of virtio-based device softc" constraint, let's use
+ * this to convert.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Link a virtio_softc to its constants, the device softc, and
+ * the PCI emulation.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mdi,
+ struct vqueue_info *queues)
+{
+ int i;
+
+ /* vs and dev_softc addresses must match */
+ assert((void *)vs == dev_softc);
+ vs->vs_vc = vc;
+ vs->vs_mi = mdi;
+
+ vs->vs_queues = queues;
+ for (i = 0; i < vc->vc_nvq; i++) {
+ queues[i].vq_vs = vs;
+ queues[i].vq_num = i;
+ }
+}
+
+/*
+ * Deliver an interrupt to the guest device.
+ */
+static void
+vq_interrupt(struct virtio_softc *vs)
+{
+ int fd = vs->vs_mi->mi_fd;
+ int error;
+
+ mmio_set_cfgdata32(vs->vs_mi, VIRTIO_MMIO_INTERRUPT_STATUS, VIRTIO_MMIO_INT_VRING);
+ error = ioctl(fd, VIRTIO_DBG_KICK);
+ if (error != 0)
+ EPRINTLN("device kick failed with %d\n", error);
+
+}
+
+/*
+ * Reset device (device-wide). This erases all queues, i.e.,
+ * all the queues become invalid (though we don't wipe out the
+ * internal pointers, we just clear the VQ_ALLOC flag).
+ *
+ * It resets negotiated features to "none".
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+ int i, nvq;
+
+ if (vs->vs_mtx)
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
+ nvq = vs->vs_vc->vc_nvq;
+ for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
+ vq->vq_flags = 0;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+ /* XXX Is this right? How should we actually set it? */
+ vq->vq_offset = UINT_MAX;
+ }
+ vs->vs_negotiated_caps = 0;
+ vs->vs_curq = 0;
+
+ mdi->mi_state = MIDEV_INVALID;
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_STATUS, 0);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq).
+ * The guest just gave us a page frame number, from which we can
+ * calculate the addresses of the queue.
+ */
+/* XXX Switch it back to using the virtio softc. */
+static void
+vi_vq_init(struct mmio_devinst *mdi, struct vqueue_info *vq)
+{
+ uint64_t offset;
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vq->vq_desc = (struct vring_desc *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vq->vq_avail = (struct vring_avail *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vq->vq_used = (struct vring_used *)(mdi->mi_addr + offset);
+
+ /* Mark queue as allocated, and start at 0 when we use it. */
+ vq->vq_flags = VQ_ALLOC;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+}
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real"
+ * descriptor.
+ */
+static inline void
+_vq_record(int i, struct vring_desc *vd, struct iovec *iov,
+ int n_iov, struct vi_req *reqp, struct iov_emul *wiove,
+ struct iov_emul *riove)
+{
+ if (i >= n_iov)
+ return;
+
+ /* XXX Handle OOM scenarios leading to iove_add failures. */
+
+ /* Preallocate a descriptor data region for the descriptor */
+ if ((vd->flags & VRING_DESC_F_WRITE) == 0) {
+ if (iove_add(riove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->readable++;
+ } else {
+ if (iove_add(wiove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->writable++;
+ }
+}
+
+#define VQ_MAX_DESCRIPTORS 512 /* see below */
+
+static int
+vq_import_indirect(struct vring_desc **vdp __unused)
+{
+ /* XXX Use the provided vd address to read in the indirect descriptor */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request. If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request. This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1. Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more. We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the "flags" and "next" field of each
+ * descriptor and tells you how many are involved. Since some may
+ * be indirect, this also needs the device instance (the mmio_devinst
+ * at vs->vs_mi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise, also using the device instance) into the given iov[]
+ * array (of the given size). If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1. If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp)
+{
+ int i;
+ u_int ndesc, n_indir;
+ u_int idx, next;
+ struct vi_req req;
+ struct vring_desc *vdir, *vindir, *vp;
+ struct virtio_softc *vs;
+ const char *name;
+ int error;
+ struct iov_emul *riove, *wiove;
+ int fd;
+
+ vs = vq->vq_vs;
+ fd = vs->vs_mi->mi_fd;
+ name = vs->vs_vc->vc_name;
+ memset(&req, 0, sizeof(req));
+
+ vindir = NULL;
+ riove = iove_alloc();
+ wiove = iove_alloc();
+ if (riove == NULL || wiove == NULL) {
+ iove_free(riove);
+ iove_free(wiove);
+ return (-1);
+ }
+
+ /*
+ * Note: it's the responsibility of the guest not to
+ * update vq->vq_avail->idx until all of the descriptors
+ * the guest has written are valid (including all their
+ * "next" fields and "flags").
+ *
+ * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is
+ * the number of descriptors the device has made available
+ * since the last time we updated vq->vq_last_avail.
+ *
+ * We just need to do the subtraction as an unsigned int,
+ * then trim off excess bits.
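+ *
+ * E.g., with vq_last_avail == 65533 and vq_avail->idx having
+ * wrapped around to 3, the subtraction below yields ndesc == 6.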
+ */
+ idx = vq->vq_last_avail;
+ ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx);
+ if (ndesc == 0) {
+ /* Nothing pending; release the staging iovecs. */
+ iove_free(riove);
+ iove_free(wiove);
+ return (0);
+ }
+ if (ndesc > vq->vq_qsize) {
+ /* XXX need better way to diagnose issues */
+ EPRINTLN(
+ "%s: ndesc (%u) out of range, driver confused?",
+ name, (u_int)ndesc);
+ goto error;
+ }
+
+ /*
+ * Now count/parse "involved" descriptors starting from
+ * the head of the chain.
+ *
+ * To prevent loops, we could be more complicated and
+ * check whether we're re-visiting a previously visited
+ * index, but we just abort if the count gets excessive.
+ */
+ req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)];
+ req.iove = wiove;
+ vq->vq_last_avail++;
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) {
+ if (next >= vq->vq_qsize) {
+ EPRINTLN(
+ "%s: descriptor index %u out of range, "
+ "driver confused?",
+ name, next);
+ goto error;
+ }
+ vdir = &vq->vq_desc[next];
+ if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) {
+ _vq_record(i, vdir, iov, niov, &req, wiove, riove);
+ i++;
+ } else if ((vs->vs_vc->vc_hv_caps &
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ EPRINTLN(
+ "%s: descriptor has forbidden INDIRECT flag, "
+ "driver confused?",
+ name);
+ goto error;
+ } else {
+ n_indir = vdir->len / 16;
+ if ((vdir->len & 0xf) || n_indir == 0) {
+ EPRINTLN(
+ "%s: invalid indir len 0x%x, "
+ "driver confused?",
+ name, (u_int)vdir->len);
+ goto error;
+ }
+
+ error = vq_import_indirect(&vindir);
+ if (error != 0)
+ goto error;
+ /*
+ * Indirects start at the 0th, then follow
+ * their own embedded "next"s until those run
+ * out. Each one's indirect flag must be off
+ * (we don't really have to check, could just
+ * ignore errors...).
+ */
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->flags & VRING_DESC_F_INDIRECT) {
+ EPRINTLN(
+ "%s: indirect desc has INDIR flag,"
+ " driver confused?",
+ name);
+ goto error;
+ }
+ _vq_record(i, vp, iov, niov, &req, wiove, riove);
+ if (++i > VQ_MAX_DESCRIPTORS) {
+ EPRINTLN(
+ "%s: descriptor loop? count > %d - driver confused?",
+ name, i);
+ goto error;
+ }
+ if ((vp->flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ next = vp->next;
+ if (next >= n_indir) {
+ EPRINTLN(
+ "%s: invalid next %u > %u, "
+ "driver confused?",
+ name, (u_int)next, n_indir);
+ goto error;
+ }
+ }
+ }
+ if ((vdir->flags & VRING_DESC_F_NEXT) == 0)
+ goto done;
+ }
+
+error:
+ iove_free(riove);
+ iove_free(wiove);
+ free(vindir);
+
+ return (-1);
+
+done:
+ /* Read in readable descriptors from the kernel. */
+ error = iove_import(fd, riove);
+ iove_free(riove);
+ free(vindir);
+
+ if (error != 0) {
+ EPRINTLN("Reading in data failed with %d", error);
+ return (-1);
+ }
+
+ *reqp = req;
+ return (i);
+}
+
+/*
+ * Return the first n_chain request chains back to the available queue.
+ *
+ * (These chains are the ones you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_retchains(struct vqueue_info *vq, uint16_t n_chains)
+{
+
+ vq->vq_last_avail -= n_chains;
+}
+
+void
+vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ struct vring_used *vuh;
+ struct vring_used_elem *vue;
+ uint16_t mask;
+
+ /*
+ * Notes:
+ * - mask is N-1 where N is a power of 2 so computes x % N
+ * - vuh points to the "used" data shared with guest
+ * - vue points to the "used" ring entry we want to update
+ */
+ mask = vq->vq_qsize - 1;
+ vuh = vq->vq_used;
+
+ vue = &vuh->ring[vq->vq_next_used++ & mask];
+ vue->id = idx;
+ vue->len = iolen;
+}
+
+void
+vq_relchain_publish(struct vqueue_info *vq)
+{
+ /*
+ * Ensure the used descriptor is visible before updating the index.
+ * This is necessary on ISAs with memory ordering less strict than x86
+ * (and even on x86 to act as a compiler barrier).
+ */
+ atomic_thread_fence_rel();
+ vq->vq_used->idx = vq->vq_next_used;
+}
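+
+/*
+ * Illustrative batching sketch (not part of this change): a device
+ * completing several chains at once can stage each entry and publish
+ * a single "used" index update, paying for one release fence:
+ *
+ *     for (i = 0; i < nchains; i++)
+ *             vq_relchain_prepare(vq, idx[i], iolen[i]);
+ *     vq_relchain_publish(vq);
+ */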
+
+/*
+ * Return specified request chain to the guest, setting its I/O length
+ * to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ vq_relchain_prepare(vq, idx, iolen);
+ vq_relchain_publish(vq);
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one. If driver used all the available
+ * chains, used_all should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt. Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when he decided to finish interrupt
+ * processing -- it's possible that descriptors became available after
+ * that point. (It's also typically a constant 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+ struct virtio_softc *vs;
+ uint16_t event_idx, new_idx, old_idx;
+ int intr;
+
+ /*
+ * Interrupt generation: if we're using EVENT_IDX,
+ * interrupt if we've crossed the event threshold.
+ * Otherwise interrupt is generated if we added "used" entries,
+ * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+ *
+ * In any case, though, if NOTIFY_ON_EMPTY is set and the
+ * entire avail was processed, we need to interrupt always.
+ */
+ vs = vq->vq_vs;
+ old_idx = vq->vq_save_used;
+ vq->vq_save_used = new_idx = vq->vq_used->idx;
+
+ /*
+ * Use full memory barrier between "idx" store from preceding
+ * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
+ * "flags" field below.
+ */
+ atomic_thread_fence_seq_cst();
+ if (used_all_avail &&
+ (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+ intr = 1;
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+ event_idx = VQ_USED_EVENT_IDX(vq);
+ /*
+ * This calculation is per docs and the kernel
+ * (see src/sys/dev/virtio/virtio_ring.h).
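+ *
+ * E.g., with old_idx == 5, new_idx == 8 and event_idx == 6:
+ * (8 - 6 - 1) == 1 < (8 - 5) == 3, so the event threshold
+ * was crossed and an interrupt is due.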
+ */
+ intr = (uint16_t)(new_idx - event_idx - 1) <
+ (uint16_t)(new_idx - old_idx);
+ } else {
+ intr = new_idx != old_idx &&
+ !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ if (intr)
+ vq_interrupt(vs);
+}
+
+/* Note: these are in sorted order to make for a fast search */
+static struct config_reg {
+ uint16_t cr_offset; /* register offset */
+ uint8_t cr_ro; /* true => reg is read only */
+ const char *cr_name; /* name of reg */
+} config_regs[] = {
+ { VIRTIO_MMIO_MAGIC_VALUE, 1, "MAGIC_VALUE" },
+ { VIRTIO_MMIO_VERSION, 1, "VERSION" },
+ { VIRTIO_MMIO_DEVICE_ID, 1, "DEVICE_ID" },
+ { VIRTIO_MMIO_VENDOR_ID, 1, "VENDOR_ID" },
+ { VIRTIO_MMIO_HOST_FEATURES, 1, "HOST_FEATURES" },
+ { VIRTIO_MMIO_HOST_FEATURES_SEL, 0, "HOST_FEATURES_SEL" },
+ { VIRTIO_MMIO_GUEST_FEATURES, 0, "GUEST_FEATURES" },
+ { VIRTIO_MMIO_GUEST_FEATURES_SEL, 0, "GUEST_FEATURES_SEL" },
+ { VIRTIO_MMIO_QUEUE_SEL, 0, "QUEUE_SEL" },
+ { VIRTIO_MMIO_QUEUE_NUM_MAX, 1, "QUEUE_NUM_MAX" },
+ { VIRTIO_MMIO_QUEUE_NUM, 0, "QUEUE_NUM" },
+ { VIRTIO_MMIO_QUEUE_READY, 0, "QUEUE_READY" },
+ { VIRTIO_MMIO_QUEUE_NOTIFY, 0, "QUEUE_NOTIFY" },
+ { VIRTIO_MMIO_INTERRUPT_STATUS, 1, "INTERRUPT_STATUS" },
+ { VIRTIO_MMIO_INTERRUPT_ACK, 0, "INTERRUPT_ACK" },
+ { VIRTIO_MMIO_STATUS, 0, "STATUS" },
+ { VIRTIO_MMIO_QUEUE_DESC_LOW, 0, "QUEUE_DESC_LOW" },
+ { VIRTIO_MMIO_QUEUE_DESC_HIGH, 0, "QUEUE_DESC_HIGH" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_LOW, 0, "QUEUE_AVAIL_LOW" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_HIGH, 0, "QUEUE_AVAIL_HIGH" },
+ { VIRTIO_MMIO_QUEUE_USED_LOW, 0, "QUEUE_USED_LOW" },
+ { VIRTIO_MMIO_QUEUE_USED_HIGH, 0, "QUEUE_USED_HIGH" },
+ { VIRTIO_MMIO_CONFIG_GENERATION, 1, "CONFIG_GENERATION" },
+};
+
+static inline struct config_reg *
+vi_find_cr(int offset) {
+ u_int hi, lo, mid;
+ struct config_reg *cr;
+
+ lo = 0;
+ hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
+ while (hi >= lo) {
+ mid = (hi + lo) >> 1;
+ cr = &config_regs[mid];
+ if (cr->cr_offset == offset)
+ return (cr);
+ if (cr->cr_offset < offset)
+ lo = mid + 1;
+ else
+ hi = mid - 1;
+ }
+ return (NULL);
+}
+
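+/*
+ * Illustrative sequence (not part of this change): a well-behaved
+ * driver advances the state machine below by writing an accumulating
+ * STATUS mask, one step at a time:
+ *
+ *     STATUS = ACK             MIDEV_INVALID      -> MIDEV_ACKNOWLEDGED
+ *     STATUS |= DRIVER         MIDEV_ACKNOWLEDGED -> MIDEV_DRIVER_FOUND
+ *     STATUS |= FEATURES_OK    MIDEV_DRIVER_FOUND -> MIDEV_FEATURES_OK
+ *     STATUS |= DRIVER_OK      MIDEV_FEATURES_OK  -> MIDEV_LIVE
+ */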
+static void
+vi_handle_state_change(struct mmio_devinst *mdi, uint32_t status)
+{
+ switch (mdi->mi_state) {
+ case MIDEV_INVALID:
+ if (status & VIRTIO_CONFIG_STATUS_ACK)
+ mdi->mi_state = MIDEV_ACKNOWLEDGED;
+ break;
+
+ case MIDEV_ACKNOWLEDGED:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER)
+ mdi->mi_state = MIDEV_DRIVER_FOUND;
+ break;
+
+ case MIDEV_DRIVER_FOUND:
+ if (status & VIRTIO_CONFIG_S_FEATURES_OK)
+ mdi->mi_state = MIDEV_FEATURES_OK;
+ break;
+
+ case MIDEV_FEATURES_OK:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK)
+ mdi->mi_state = MIDEV_LIVE;
+
+ break;
+
+ case MIDEV_LIVE:
+ break;
+
+ case MIDEV_FAILED:
+ mdi->mi_state = MIDEV_FAILED;
+ break;
+
+ default:
+ EPRINTLN("invalid device state %d", mdi->mi_state);
+ exit(1);
+ }
+}
+
+static void
+vi_handle_status(struct virtio_softc *vs, uint32_t status)
+{
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (status & VIRTIO_CONFIG_STATUS_FAILED) {
+ mdi->mi_state = MIDEV_FAILED;
+ return;
+ }
+
+ if (status & VIRTIO_CONFIG_STATUS_RESET) {
+ mdi->mi_state = MIDEV_INVALID;
+ vi_reset_dev(vs);
+ return;
+ }
+
+ vi_handle_state_change(mdi, status);
+}
+
+static void
+vi_handle_host_features_sel(struct virtio_softc *vs, uint32_t sel)
+{
+ uint64_t caps = vs->vs_vc->vc_hv_caps;
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (sel > 1) {
+ EPRINTLN("HOST_FEATURES SEL 0x%x, "
+ "driver confused?", sel);
+ return;
+ }
+
+ if (sel == 1) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)(caps >> 32));
+ } else {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)caps);
+ }
+}
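+
+/*
+ * Worked example (illustrative): with vc_hv_caps == 0x123456789, a
+ * driver write of 0 to HOST_FEATURES_SEL exposes the low word,
+ * 0x23456789, through HOST_FEATURES, and a write of 1 exposes the
+ * high word, 0x1.
+ */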
+
+static void
+vi_handle_guest_features(struct virtio_softc *vs, uint32_t features)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc = vs->vs_vc;
+ uint64_t caps;
+ int hi;
+
+ /*
+ * XXX Add asserts to ensure we are negotiating w/ the device
+ * and not in the middle of an operation.
+ */
+
+ hi = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_GUEST_FEATURES_SEL);
+ if (hi > 1) {
+ EPRINTLN("GUEST_FEATURES_SEL 0x%x, "
+ "driver confused?", hi);
+ return;
+ }
+
+ if (hi == 1) {
+ /* Update the upper bits, keep the lower ones intact. */
+ caps = (vc->vc_hv_caps >> 32) & features;
+ vs->vs_negotiated_caps &= (1UL << 32) - 1;
+ vs->vs_negotiated_caps |= (caps << 32);
+ } else {
+ /* Update the lower bits, keep the upper ones intact. */
+ caps = (uint32_t)vc->vc_hv_caps & features;
+ vs->vs_negotiated_caps &= ~((1UL << 32) - 1);
+ vs->vs_negotiated_caps |= caps;
+
+ /* The LSBs are sent second, so we are ready to apply the features. */
+ if (vc->vc_apply_features)
+ (*vc->vc_apply_features)(DEV_SOFTC(vs),
+ vs->vs_negotiated_caps);
+ }
+}
+
+static void
+vi_handle_queue_sel(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+
+ vs->vs_curq = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_SEL);
+
+ if (vs->vs_curq < 0 || vs->vs_curq >= vs->vs_vc->vc_nvq) {
+ EPRINTLN("Selected queue %d, driver confused?", vs->vs_curq);
+ return;
+ }
+
+ vq = &vs->vs_queues[vs->vs_curq];
+ if (vq_ring_ready(vq)) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 1);
+ return;
+ }
+
+ /* Part of virtqueue initialization. */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_NUM_MAX, vq->vq_qsize);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+}
+
+static void
+vi_handle_queue_num(struct virtio_softc *vs, int32_t qsize)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+
+ if (qsize > vq->vq_qsize || !powerof2(qsize)) {
+ EPRINTLN("QUEUE_NUM %d is invalid, driver confused?", qsize);
+ return;
+ }
+
+ vq->vq_qsize = qsize;
+}
+
+static void
+vi_handle_queue_ready(struct virtio_softc *vs, uint32_t ready)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (ready > 1) {
+ EPRINTLN("QUEUE_READY has value %d, driver confused?", ready);
+ return;
+ }
+
+ if (ready == 1 && !vq_ring_ready(vq)) {
+ vi_vq_init(mdi, vq);
+ return;
+ }
+}
+
+static void
+vi_handle_interrupt_ack(struct virtio_softc *vs, uint32_t ack)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ /*
+ * Follow the protocol even if we are executing the
+ * interrupt ourselves, so we are the ones that sent
+ * the ACK from the kernel in the first place.
+ */
+ if (ack != 1) {
+ EPRINTLN("INTERRUPT_ACK has value %d, "
+ "driver confused?", ack);
+ return;
+ }
+
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_ACK, 0);
+}
+
+static void
+vi_handle_queue_notify(struct virtio_softc *vs, uint32_t ind)
+{
+ struct virtio_consts *vc = vs->vs_vc;
+ struct vqueue_info *vq;
+
+ if (ind >= (unsigned int)vc->vc_nvq) {
+ EPRINTLN("%s: queue %d notify out of range",
+ vc->vc_name, ind);
+ return;
+ }
+
+ vq = &vs->vs_queues[ind];
+ if (vq->vq_notify) {
+ (*vq->vq_notify)(DEV_SOFTC(vs), vq);
+ } else if (vc->vc_qnotify) {
+ (*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+ } else {
+ EPRINTLN("%s: qnotify value %d: missing vq/vc notify",
+ vc->vc_name, ind);
+ }
+}
+
+void
+vi_mmio_write(struct virtio_softc *vs, uint64_t offset)
+{
+ /* Reported writes are always 32-bit. */
+ const int size = 4;
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ const char *name;
+ uint32_t newoff;
+ int32_t value;
+ uint64_t max;
+ int error;
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+
+ /* Writes to the device-specific config space go to the cfgwrite hook. */
+ if (offset >= VIRTIO_MMIO_CONFIG) {
+ newoff = offset - VIRTIO_MMIO_CONFIG;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : (mdi->mi_bytes - VIRTIO_MMIO_CONFIG);
+ if (newoff + size > max)
+ goto bad;
+
+ value = mmio_get_cfgdata32(mdi, offset);
+
+ if (vc->vc_cfgwrite != NULL)
+ error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+ else
+ error = 0;
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset);
+ if (cr == NULL) {
+ EPRINTLN("%s: write to bad offset %jd",
+ name, (uintmax_t)offset);
+ goto done;
+ }
+
+ if (cr->cr_ro) {
+ EPRINTLN("%s: write to read-only reg %s",
+ name, cr->cr_name);
+ goto done;
+ }
+
+ value = mmio_get_cfgdata32(mdi, cr->cr_offset);
+
+ switch (cr->cr_offset) {
+ case VIRTIO_MMIO_STATUS:
+ vi_handle_status(vs, value);
+ break;
+
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ vi_handle_host_features_sel(vs, value);
+ break;
+
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ vi_handle_guest_features(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_SEL:
+ vi_handle_queue_sel(vs);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NUM:
+ vi_handle_queue_num(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_READY:
+ vi_handle_queue_ready(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ vi_handle_queue_notify(vs, value);
+ break;
+
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ vi_handle_interrupt_ack(vs, value);
+ break;
+
+ default:
+ EPRINTLN("Unhandled offset %d", cr->cr_offset);
+ assert(0);
+ }
+
+done:
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/tests/sys/virtio/virtiodbg.c b/tests/sys/virtio/virtiodbg.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtiodbg.c
@@ -0,0 +1,105 @@
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+
+static void
+virtiodbg_usage(int code)
+{
+ const char *progname;
+
+ progname = getprogname();
+
+ fprintf(stderr,
+ "Usage: %s [-hot]\n"
+ " -h: help\n"
+ " -o: set config 'var' to 'value'\n"
+ " -t: MMIO device type\n",
+ progname);
+ exit(code);
+}
+
+static bool
+virtiodbg_parse_config_option(nvlist_t *nvl, const char *option)
+{
+ const char *key;
+ char *value;
+
+ key = option;
+ value = strchr(option, '=');
+ if (value == NULL || value[1] == '\0')
+ return (false);
+
+ *value = '\0';
+
+ set_config_value_node(nvl, key, value + 1);
+ return (true);
+}
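+
+/*
+ * Example invocation (illustrative; the image path is hypothetical):
+ *
+ *     virtiodbg -t virtio-blk -o path=/tmp/test.img
+ *
+ * "virtio-blk" is the me_emu name registered by the block device
+ * emulation, and "path" is the config key its blockif backend opens.
+ */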
+
+static nvlist_t *
+virtiodbg_optparse(int argc, char **argv)
+{
+ const char *optstr;
+ nvlist_t *nvl;
+ int c;
+
+ nvl = create_config_node("device");
+
+ optstr = "ho:t:";
+ while ((c = getopt(argc, argv, optstr)) != -1) {
+ switch (c) {
+ case 't':
+ if (strncmp(optarg, "help", strlen(optarg)) == 0) {
+ mmio_print_supported_devices();
+ exit(0);
+ } else if (mmio_parse_device(nvl, optarg) != 0)
+ exit(4);
+ else
+ break;
+ case 'o':
+ if (!virtiodbg_parse_config_option(nvl, optarg)) {
+ errx(EX_USAGE,
+ "invalid configuration option '%s'",
+ optarg);
+ }
+ break;
+ case 'h':
+ virtiodbg_usage(0);
+ default:
+ virtiodbg_usage(1);
+ }
+ }
+
+ return (nvl);
+}
+
+int
+main(int argc, char *argv[])
+{
+ nvlist_t *nvl;
+
+ init_config();
+ nvl = virtiodbg_optparse(argc, argv);
+
+ /* Exit if a device emulation finds an error in its initialization */
+ if (init_mmio(nvl) != 0) {
+ EPRINTLN("Device emulation initialization error: %s",
+ strerror(errno));
+ exit(4);
+ }
+
+ /* Head off to the main event dispatch loop. */
+ mevent_dispatch();
+
+ exit(4);
+}