D45370.diff

diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3438,6 +3438,7 @@
dev/virtio/mmio/virtio_mmio_cmdline.c optional virtio_mmio
dev/virtio/mmio/virtio_mmio_fdt.c optional virtio_mmio fdt
dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio
+dev/virtio/dbg/virtio_dbg.c optional virtio_dbg
dev/virtio/network/if_vtnet.c optional vtnet
dev/virtio/balloon/virtio_balloon.c optional virtio_balloon
dev/virtio/block/virtio_blk.c optional virtio_blk
diff --git a/sys/dev/virtio/dbg/virtio_dbg.h b/sys/dev/virtio/dbg/virtio_dbg.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.h
@@ -0,0 +1,25 @@
+#ifndef _VIRTIO_DBG_
+#define _VIRTIO_DBG_
+
+#include <sys/cdefs.h>
+#include <sys/ioccom.h>
+
+struct vtdbg_transfer {
+ caddr_t vtdt_device;
+ caddr_t vtdt_driver;
+ size_t vtdt_len;
+};
+
+struct vtdbg_io_args {
+ struct vtdbg_transfer *transfers;
+ size_t cnt;
+ bool touser;
+};
+
+#define VIRTIO_DBG_INIT _IO('v', 1)
+#define VIRTIO_DBG_KICK _IO('v', 2)
+#define VIRTIO_DBG_ACK _IO('v', 3)
+#define VIRTIO_DBG_TRANSFER _IOWR('v', 4, struct vtdbg_io_args)
+
+
+#endif /* _VIRTIO_DBG_ */
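
[Editorial sketch, not part of the patch: how a userspace device emulator might drive this ioctl interface. The /dev/vtdbg path and the 10 MB mapping size mirror the cdev name and VTDBG_MAPSZ defined in virtio_dbg.c below; the helper name and its arguments are hypothetical and most error handling is omitted.]

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include "virtio_dbg.h"

/* Hypothetical helper: open an instance, create the transport, copy one buffer out. */
static int
vtdbg_example(void *user_buf, uint64_t desc_paddr, size_t len)
{
	struct vtdbg_transfer xfer;
	struct vtdbg_io_args args;
	void *shared;
	int fd;

	fd = open("/dev/vtdbg", O_RDWR);	/* one emulated transport per open */
	if (fd < 0)
		return (-1);

	/* Map the shared kernel/user device control region (registers + vrings). */
	shared = mmap(NULL, 10 * 1024 * 1024, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (shared == MAP_FAILED)
		return (-1);
	(void)shared;

	/* Create the emulated transport and its virtio child device. */
	if (ioctl(fd, VIRTIO_DBG_INIT) != 0)
		return (-1);

	/* Copy data from a descriptor's physical address out to our buffer. */
	xfer.vtdt_driver = (caddr_t)(uintptr_t)desc_paddr;
	xfer.vtdt_device = user_buf;
	xfer.vtdt_len = len;
	args.transfers = &xfer;
	args.cnt = 1;
	args.touser = true;
	if (ioctl(fd, VIRTIO_DBG_TRANSFER, &args) != 0)
		return (-1);

	/* Tell the kernel to run the device's interrupt handler. */
	ioctl(fd, VIRTIO_DBG_KICK);
	return (fd);
}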
diff --git a/sys/dev/virtio/dbg/virtio_dbg.c b/sys/dev/virtio/dbg/virtio_dbg.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/virtio/dbg/virtio_dbg.c
@@ -0,0 +1,970 @@
+/*-
+ * Copyright (c) 2024 Emil Tsalapatis
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/bus.h>
+#include <machine/pmap.h>
+#include <machine/resource.h>
+#include <machine/vmparam.h>
+
+#include <dev/virtio/virtio_config.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/dbg/virtio_dbg.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+#include "virtio_mmio_if.h"
+
+#define VTDBG_MAGIC ((uint64_t)0x84848484ULL)
+
+/*
+ * XXX Determine these sizes in a well-defined
+ * per-device fashion.
+ */
+#define VTDBG_MAPSZ (1024 * 1024 * 10)
+#define VTDBG_RESERVE_DEVSPACE (4096)
+
+/* XXX Remove after development is done. */
+#define VTDBG_WARN(format, ...) \
+ do { \
+ printf("(%s:%d) " format, __func__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+static device_t vtdbg_parent;
+static driver_t *vtdbg_driver;
+
+#define VTDBG_UPDATE_DESC (0x01)
+#define VTDBG_UPDATE_USED (0x02)
+#define VTDBG_UPDATE_AVAIL (0x04)
+#define VTDBG_INTR_PENDING (0x08)
+#define VTDBG_INTR_EXITING (0x10)
+
+/*
+ * Information on a debug device instance. Accessed
+ * through the control device's softc.
+ */
+struct vtdbg_softc {
+ struct mtx vtd_mtx;
+ struct cv vtd_cv;
+ struct knlist vtd_note;
+ uint32_t vtd_magic;
+
+ vm_object_t vtd_object;
+ vm_ooffset_t vtd_baseaddr;
+ size_t vtd_bytes;
+ size_t vtd_allocated;
+
+ virtqueue_intr_t *vtd_intr;
+ void *vtd_intr_arg;
+ struct proc *vtd_pintr;
+
+ vm_ooffset_t vtd_offset;
+
+ uint32_t vtd_flags;
+
+ device_t vtd_dev;
+};
+
+/*
+ * Subclass of vtmmio_softc that lets the virtio device access
+ * vtdbg-related information while still being usable from vtmmio_*
+ * methods. The vtdbg_softc * is the softc of the control device and
+ * is allocated dynamically when opening an instance of the control device,
+ * while the virtio_dbg_softc here is allocated during device_t creation.
+ */
+struct virtio_dbg_softc {
+ struct vtmmio_softc vtmdbg_mmio;
+ struct vtdbg_softc *vtmdbg_dbg;
+};
+
+/*
+ * Store the parent bus and driver pointers for the debug devices,
+ * because we need them when creating debug devices on-demand later on.
+ * We hang off the nexus, so we are certain it is not going away.
+ */
+static void
+virtio_dbg_identify(driver_t *driver, device_t parent)
+{
+ vtdbg_parent = parent;
+ vtdbg_driver = driver;
+}
+
+static struct vtdbg_softc *
+vtmmio_get_vtdbg(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ MPASS(sc->vtmdbg_dbg->vtd_magic == VTDBG_MAGIC);
+
+ return (sc->vtmdbg_dbg);
+}
+
+/*
+ * Explicitly turn polling into a no-op.
+ */
+static int
+virtio_dbg_poll(device_t dev)
+{
+
+ return (0);
+}
+
+
+/*
+ * Make sure the shared virtio device region between kernel and userspace
+ * is configured properly.
+ */
+static int
+virtio_dbg_probe(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ uint32_t magic, version;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /* Fake platform to trigger virtio_mmio_note() on writes. */
+ sc->vtmdbg_mmio.platform = dev;
+
+ magic = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_MAGIC_VALUE);
+ if (magic != VIRTIO_MMIO_MAGIC_VIRT) {
+ device_printf(dev, "Bad magic value %#x\n", magic);
+ return (ENXIO);
+ }
+
+ version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+ if (version != 2) {
+ device_printf(dev, "Unsupported version: %#x\n", version);
+ return (ENXIO);
+ }
+
+ if (vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_DEVICE_ID) == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Emulated MMIO adapter");
+
+ return (0);
+}
+
+/*
+ * Creates the virtio device corresponding to the transport instance.
+ */
+static int
+virtio_dbg_attach(device_t dev)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ device_t child;
+
+ sc = device_get_softc(dev);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ mmiosc->dev = dev;
+ mmiosc->vtmmio_version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION);
+
+ vtmmio_reset(mmiosc);
+
+ /* Tell the host we've noticed this device. */
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+ mtx_lock(&Giant);
+ if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+ device_printf(dev, "Cannot create child device.\n");
+ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ return (ENOMEM);
+ }
+
+ mmiosc->vtmmio_child_dev = child;
+ vtmmio_probe_and_attach_child(mmiosc);
+
+ mtx_unlock(&Giant);
+
+ return (0);
+}
+
+/*
+ * Recompute the queue descriptor to be an offset within the shared user/kernel
+ * device control region. Our userspace cannot meaningfully translate
+ * kernel physical addresses, so we transform the values in the queue
+ * descriptor address registers into offsets. Userspace finds the vq address
+ * by adding the offset to its own virtual address for the region.
+ */
+static void
+virtio_dbg_qdesc_offset(struct vtmmio_softc *sc, uint64_t baseaddr,
+ int hireg, int loreg)
+{
+ struct resource *res = sc->res[0];
+ uint32_t hi, lo;
+ uint64_t qaddr;
+
+ /* Read in the components of the physical address. */
+ hi = bus_read_4(res, hireg);
+ lo = bus_read_4(res, loreg);
+
+ /* Recompute into an offset into the vq control region. */
+ qaddr = (((uint64_t)hi) << 32 | (uint64_t)lo);
+ qaddr -= vtophys(baseaddr);
+
+ /* Update the register values. */
+ hi = (qaddr >> 32);
+ lo = (qaddr & ((1ULL << 32) - 1));
+
+ /* Use a direct bus write to avoid triggering note(). */
+ bus_write_4(res, hireg, hi);
+ bus_write_4(res, loreg, lo);
+}
+
+/* Notify userspace of a write, and wait for a response. */
+static int
+virtio_dbg_note(device_t dev, size_t offset, int val)
+{
+ struct vtdbg_softc *vtdsc;
+ struct virtio_dbg_softc *sc;
+
+ sc = device_get_softc(dev);
+ vtdsc = sc->vtmdbg_dbg;
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ /*
+ * Intercept writes to the QUEUE_{DESC, AVAIL, USED}_{HIGH, LOW}
+ * registers and instead pass to the user the offset from the beginning
+ * of the control region. Do not notify userspace of these writes yet;
+ * we recompute the offsets and notify once VIRTIO_MMIO_QUEUE_READY is set.
+ *
+ * Both high and low registers are set together, so just track writes to
+ * the high address bits.
+ */
+ switch (offset) {
+ case VIRTIO_MMIO_QUEUE_DESC_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_DESC;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_USED_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_USED;
+ return (1);
+ case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
+ vtdsc->vtd_flags |= VTDBG_UPDATE_AVAIL;
+ return (1);
+ }
+
+ /* Only forward the listed register writes to userspace. */
+ switch (offset) {
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ case VIRTIO_MMIO_QUEUE_SEL:
+ case VIRTIO_MMIO_QUEUE_NUM:
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ case VIRTIO_MMIO_STATUS:
+ break;
+ case VIRTIO_MMIO_QUEUE_READY:
+ /* If changed, transform the offsets. */
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_DESC) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_DESC_HIGH, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_DESC;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_USED) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_USED_HIGH, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_USED;
+ }
+
+ if (vtdsc->vtd_flags & VTDBG_UPDATE_AVAIL) {
+ virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr,
+ VIRTIO_MMIO_QUEUE_AVAIL_HIGH, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vtdsc->vtd_flags &= ~VTDBG_UPDATE_AVAIL;
+ }
+ break;
+ default:
+ return (1);
+ }
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ vtdsc->vtd_offset = offset;
+ KNOTE_LOCKED(&vtdsc->vtd_note, 0);
+
+ /*
+ * We cannot sleep here because this code is called with non-sleepable locks
+ * held; the corresponding operation for other transports is a VM exit, which
+ * is instantaneous from the guest kernel's point of view. To prevent a
+ * "sleeping thread" panic, we busy wait instead. There is always the danger
+ * of the VMM process leaving us hanging, but that danger also exists for
+ * non-emulated virtio transports - it just isn't visible to the guest, since
+ * the VMM normally runs on the host.
+ */
+ while (vtdsc->vtd_offset != 0) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ cpu_spinwait();
+ mtx_lock(&vtdsc->vtd_mtx);
+ }
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (1);
+}
+
+/*
+ * Pass interrupt information to the cdev. The cdev will be directly
+ * running the device interrupt handling code as an ioctl.
+ */
+static int
+virtio_dbg_setup_intr(device_t dev, device_t mmio_dev, void *handler, void *ih_user)
+{
+ struct vtdbg_softc *sc;
+
+ sc = vtmmio_get_vtdbg(dev);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_intr = handler;
+ sc->vtd_intr_arg = ih_user;
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (0);
+}
+
+static device_method_t virtio_dbg_methods[] = {
+ DEVMETHOD(device_attach, virtio_dbg_attach),
+ DEVMETHOD(device_identify, virtio_dbg_identify),
+ DEVMETHOD(device_probe, virtio_dbg_probe),
+
+ DEVMETHOD(virtio_mmio_poll, virtio_dbg_poll),
+ DEVMETHOD(virtio_mmio_note, virtio_dbg_note),
+ DEVMETHOD(virtio_mmio_setup_intr, virtio_dbg_setup_intr),
+
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_1(virtio_dbg, virtio_dbg_driver, virtio_dbg_methods,
+ sizeof(struct vtdbg_softc), vtmmio_driver);
+/*
+ * XXX We currently hang off the nexus; not 100% sure this is the right way.
+ */
+DRIVER_MODULE(virtio_dbg, nexus, virtio_dbg_driver, 0, 0);
+MODULE_VERSION(virtio_dbg, 1);
+
+static struct cdev *vtdbg_dev;
+
+/*
+ * Create and map the device memory into the kernel.
+ */
+static int
+vtdbg_map_kernel(struct vtdbg_softc *sc)
+{
+ vm_object_t obj = sc->vtd_object;
+ size_t bytes = IDX_TO_OFF(obj->size);
+ vm_offset_t baseaddr, tmp;
+ vm_page_t m, end_m;
+ int error;
+
+ /* XXX Do not allow mapping twice. */
+
+ vm_object_reference(obj);
+
+ /*
+ * Populate the object with physically contiguous pages, because
+ * the object is used to back the virtio device control region.
+ */
+ VM_OBJECT_WLOCK(obj);
+ m = vm_page_alloc_contig(obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_ZERO, obj->size,
+ 0, (uint64_t) -1, 1, 0, VM_MEMATTR_DEFAULT);
+ VM_OBJECT_WUNLOCK(obj);
+ if (m == NULL) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+
+ baseaddr = VM_MIN_KERNEL_ADDRESS;
+ error = vm_map_find(kernel_map, obj, 0, &baseaddr, bytes, VM_MAX_KERNEL_ADDRESS,
+ VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ return (ENOMEM);
+ }
+
+ end_m = m + (bytes / PAGE_SIZE);
+ tmp = baseaddr;
+ for (; m < end_m; m++) {
+ vm_page_valid(m);
+ pmap_zero_page(m);
+ pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
+ VM_PROT_RW | PMAP_ENTER_WIRED, 0);
+ tmp += PAGE_SIZE;
+ vm_page_xunbusy(m);
+ }
+
+
+ sc->vtd_baseaddr = baseaddr;
+ sc->vtd_bytes = bytes;
+
+ /* Reserve space for the device control region. */
+ sc->vtd_allocated = VTDBG_RESERVE_DEVSPACE;
+
+ return (0);
+}
+
+static void
+vtdbg_intr(void *arg)
+{
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+
+ mtx_lock(&sc->vtd_mtx);
+ while ((sc->vtd_flags & VTDBG_INTR_EXITING) == 0) {
+ if ((sc->vtd_flags & VTDBG_INTR_PENDING) == 0) {
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ continue;
+ }
+
+ sc->vtd_flags &= ~VTDBG_INTR_PENDING;
+ mtx_unlock(&sc->vtd_mtx);
+
+ if (sc->vtd_intr)
+ sc->vtd_intr(sc->vtd_intr_arg);
+
+ mtx_lock(&sc->vtd_mtx);
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ }
+
+ sc->vtd_pintr = NULL;
+ cv_signal(&sc->vtd_cv);
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ kproc_exit(0);
+}
+
+/*
+ * Destroy the virtio transport instance when closing the
+ * corresponding control device fd.
+ */
+static void
+vtdbg_dtor(void *arg)
+{
+ struct virtio_dbg_softc *devsc;
+ struct vtdbg_softc *sc = (struct vtdbg_softc *)arg;
+ vm_offset_t sva, eva;
+ device_t dev;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (sc->vtd_pintr != NULL) {
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_EXITING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+
+ mtx_lock(&sc->vtd_mtx);
+ while (sc->vtd_pintr != NULL)
+ cv_wait(&sc->vtd_cv, &sc->vtd_mtx);
+ mtx_unlock(&sc->vtd_mtx);
+ }
+
+ dev = sc->vtd_dev;
+ if (dev != NULL) {
+ devsc = device_get_softc(dev);
+
+ mtx_lock(&Giant);
+ DEVICE_DETACH(dev);
+ mtx_unlock(&Giant);
+
+ free(devsc->vtmdbg_mmio.res[0], M_DEVBUF);
+ device_delete_child(vtdbg_parent, dev);
+ }
+
+
+ if (sc->vtd_baseaddr != 0) {
+ sva = sc->vtd_baseaddr;
+ eva = sva + sc->vtd_bytes;
+ vm_map_remove(kernel_map, sva, eva);
+ pmap_remove(kernel_pmap, sva, eva);
+ }
+
+ vm_object_deallocate(sc->vtd_object);
+
+ knlist_delete(&sc->vtd_note, curthread, 0);
+ knlist_destroy(&sc->vtd_note);
+
+ cv_destroy(&sc->vtd_cv);
+ mtx_destroy(&sc->vtd_mtx);
+
+ free(sc, M_DEVBUF);
+}
+
+static int
+vtdbg_open(struct cdev *cdev, int oflags, int devtype, struct thread *td)
+{
+ size_t sz = round_page(VTDBG_MAPSZ);
+ struct vtdbg_softc *sc;
+ int error;
+
+ sc = malloc(sizeof(struct vtdbg_softc), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (sc == NULL)
+ return (ENOMEM);
+
+ sc->vtd_magic = VTDBG_MAGIC;
+ mtx_init(&sc->vtd_mtx, "vtdbg", NULL, MTX_DEF);
+ cv_init(&sc->vtd_cv, "vtdbg");
+
+ knlist_init_mtx(&sc->vtd_note, &sc->vtd_mtx);
+
+ /* Create the common userspace/kernel virtio device region. */
+ sc->vtd_object = vm_pager_allocate(OBJT_PHYS, NULL, sz, VM_PROT_ALL,
+ 0, thread0.td_ucred);
+ if (sc->vtd_object == NULL) {
+ vtdbg_dtor(sc);
+ return (ENOMEM);
+ }
+
+ error = vtdbg_map_kernel(sc);
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = kproc_create(vtdbg_intr, (void *)sc, &sc->vtd_pintr,
+ 0, 0, "vtdbg_intr");
+ if (error != 0) {
+ vtdbg_dtor(sc);
+ return (error);
+ }
+
+ error = devfs_set_cdevpriv((void *)sc, vtdbg_dtor);
+ if (error != 0)
+ vtdbg_dtor(sc);
+
+ return (error);
+}
+
+static int
+vtdbg_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+ vm_size_t size, vm_object_t *objp, int nprot)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+
+ if (*offset + size > sc->vtd_bytes)
+ return (EINVAL);
+
+ vm_object_reference(sc->vtd_object);
+ *objp = sc->vtd_object;
+
+ return (0);
+}
+
+static void *
+vtdbg_ringalloc(device_t dev, size_t size)
+{
+ struct vtdbg_softc *sc = vtmmio_get_vtdbg(dev);
+ void *mem;
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ mtx_lock(&sc->vtd_mtx);
+ if (sc->vtd_allocated + size > sc->vtd_bytes) {
+ mtx_unlock(&sc->vtd_mtx);
+ return (NULL);
+ }
+
+ mem = (void *)(sc->vtd_baseaddr + sc->vtd_allocated);
+ sc->vtd_allocated += size;
+
+ mtx_unlock(&sc->vtd_mtx);
+
+ return (mem);
+}
+
+static device_t
+vtdbg_create_transport(device_t parent, struct vtdbg_softc *vtdsc)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtmmio_softc *mmiosc;
+ struct resource *res;
+ device_t transport;
+
+ int uid = 0;
+
+ transport = BUS_ADD_CHILD(parent, 0, virtio_dbg_driver.name, uid);
+ device_set_driver(transport, vtdbg_driver);
+
+ sc = device_get_softc(transport);
+ mmiosc = &sc->vtmdbg_mmio;
+
+ /*
+ * XXX Hack. Create the resource out of thin air to
+ * keep the vtmmio_write_* calls working. To be uniform we would
+ * reserve the resource from the RAM pseudobus,
+ * but it has no associated struct rman * instance,
+ * and we have already reserved this memory region
+ * by allocating it anyway, so there is no possibility
+ * of conflicts.
+ */
+ res = malloc(sizeof(*res), M_DEVBUF, M_WAITOK);
+ res->r_bushandle = vtdsc->vtd_baseaddr;
+ res->r_bustag = X86_BUS_SPACE_MEM;
+ mmiosc->res[0] = res;
+
+ /* Ring buffer allocation callback. */
+ mmiosc->vtmmio_ringalloc_cb = vtdbg_ringalloc;
+
+ return (transport);
+}
+
+static int
+vtdbg_linkup_transport(struct vtdbg_softc *vtdsc, device_t dev)
+{
+ struct virtio_dbg_softc *mmiosc;
+
+ mtx_lock(&vtdsc->vtd_mtx);
+ if (vtdsc->vtd_dev != NULL) {
+ mtx_unlock(&vtdsc->vtd_mtx);
+ return (EALREADY);
+ }
+
+ mmiosc = device_get_softc(dev);
+
+ /* Have the device and cdev be able to refer to each other. */
+ mmiosc->vtmdbg_dbg = vtdsc;
+ vtdsc->vtd_dev = dev;
+
+ mtx_unlock(&vtdsc->vtd_mtx);
+
+ return (0);
+}
+
+/*
+ * Create the virtio device. This function initializes both
+ * the emulated transport and the virtio device. These are
+ * normally (e.g., for MMIO) created at boot time using vtmmio_probe/vtmmio_attach
+ * and vtmmio_probe_and_attach_child, respectively. We do this initialization
+ * here because we are dynamically creating the devices after booting, so
+ * we must manually invoke the device probe and attach methods.
+ */
+static int
+vtdbg_init(void)
+{
+ struct virtio_dbg_softc *sc;
+ struct vtdbg_softc *vtdsc;
+ device_t transport;
+ int error;
+
+ /* Retrieve the mapping address/size. */
+ error = devfs_get_cdevpriv((void **)&vtdsc);
+ if (error != 0)
+ return (error);
+
+ MPASS(vtdsc->vtd_magic == VTDBG_MAGIC);
+
+ transport = vtdbg_create_transport(vtdbg_parent, vtdsc);
+
+ error = vtdbg_linkup_transport(vtdsc, transport);
+ if (error != 0)
+ goto err;
+
+ error = DEVICE_PROBE(transport);
+ if (error != 0)
+ goto err;
+
+ return (DEVICE_ATTACH(transport));
+
+err:
+ sc = device_get_softc(transport);
+
+ /*
+ * Release the resource but do not notify
+ * the parent bus, since we did not reserve
+ * the resource from it.
+ */
+ free(sc->vtmdbg_mmio.res[0], M_DEVBUF);
+
+ mtx_lock(&Giant);
+ device_delete_child(vtdbg_parent, transport);
+ mtx_unlock(&Giant);
+
+ vtdsc->vtd_dev = NULL;
+
+ return (error);
+}
+
+/*
+ * Kick the dedicated kernel interrupt process.
+ */
+static void
+vtdbg_kick(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_flags |= VTDBG_INTR_PENDING;
+ cv_signal(&sc->vtd_cv);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * The mmio virtio code uses note() to let the host know there has been a register write.
+ * The note() call suspends the thread until userspace has emulated the write,
+ * at which point a userspace thread allows it to resume.
+ *
+ * There can only be one unacknowledged write outstanding at a time, so a single
+ * vtd_offset field in the softc is enough.
+ */
+static void
+vtdbg_ack(struct vtdbg_softc *sc)
+{
+ mtx_lock(&sc->vtd_mtx);
+ sc->vtd_offset = 0;
+ wakeup(sc);
+ mtx_unlock(&sc->vtd_mtx);
+}
+
+/*
+ * Copy virtio data in and out of the kernel; userspace needs this to access
+ * the data pointed to by the virtqueue descriptors.
+ */
+static int
+vtdbg_io(struct vtdbg_softc *sc, struct vtdbg_io_args *args)
+{
+ struct vtdbg_transfer *tf;
+ caddr_t driver, device;
+ int error = 0;
+ size_t len;
+ int i;
+
+ tf = malloc(args->cnt * sizeof(*tf), M_DEVBUF, M_NOWAIT);
+ if (tf == NULL)
+ return (ENOMEM);
+
+ error = copyin(args->transfers, tf, args->cnt * (sizeof(*tf)));
+ if (error != 0) {
+ free(tf, M_DEVBUF);
+ return (error);
+ }
+
+ for (i = 0; i < args->cnt; i++) {
+ /* Translate from physical to kernel virtual. */
+ driver = (caddr_t)PHYS_TO_DMAP((vm_paddr_t)tf[i].vtdt_driver);
+ device = tf[i].vtdt_device;
+ len = tf[i].vtdt_len;
+
+ if (args->touser)
+ error = copyout(driver, device, len);
+ else
+ error = copyin(device, driver, len);
+
+ if (error != 0)
+ break;
+ }
+
+ free(tf, M_DEVBUF);
+
+ return (error);
+}
+
+
+static int
+vtdbg_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct vtdbg_softc *sc;
+ int ret = 0;
+
+ ret = devfs_get_cdevpriv((void **)&sc);
+ if (ret != 0)
+ return (ret);
+
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ switch (cmd) {
+ case VIRTIO_DBG_INIT:
+ ret = vtdbg_init();
+ break;
+ case VIRTIO_DBG_KICK:
+ vtdbg_kick(sc);
+ break;
+ case VIRTIO_DBG_ACK:
+ vtdbg_ack(sc);
+ break;
+ case VIRTIO_DBG_TRANSFER:
+ ret = vtdbg_io(sc, (struct vtdbg_io_args *)data);
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+vtdbg_filt_attach(struct knote *kn)
+{
+ kn->kn_flags |= EV_CLEAR;
+ return (0);
+}
+
+static void
+vtdbg_filt_detach(struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ knlist_remove(&sc->vtd_note, kn, 0);
+ kn->kn_hook = NULL;
+}
+
+static int
+vtdbg_filt_read(struct knote *kn, long hint)
+{
+ struct vtdbg_softc *sc;
+
+
+ sc = (struct vtdbg_softc *)kn->kn_hook;
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+ mtx_assert(&sc->vtd_mtx, MA_OWNED);
+
+ if (sc->vtd_offset == 0)
+ return (0);
+
+ kn->kn_data = sc->vtd_offset;
+
+ return (1);
+}
+
+struct filterops vtdbg_filtops = {
+ .f_isfd = 1,
+ .f_attach = vtdbg_filt_attach,
+ .f_detach = vtdbg_filt_detach,
+ .f_event = vtdbg_filt_read,
+};
+
+static int
+vtdbg_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct vtdbg_softc *sc;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&sc);
+ if (error != 0)
+ return (error);
+ MPASS(sc->vtd_magic == VTDBG_MAGIC);
+
+ if (kn->kn_filter != EVFILT_READ) {
+ kn->kn_data = EINVAL;
+ return (EINVAL);
+ }
+
+ kn->kn_fop = &vtdbg_filtops;
+ kn->kn_hook = sc;
+ knlist_add(&sc->vtd_note, kn, 0);
+
+ return (0);
+
+}
+
+static struct cdevsw vtdbg_cdevsw = {
+ .d_open = vtdbg_open,
+ .d_mmap_single = vtdbg_mmap_single,
+ .d_ioctl = vtdbg_ioctl,
+ .d_kqfilter = vtdbg_kqfilter,
+ .d_name = "vtdbg",
+ .d_version = D_VERSION,
+};
+
+static int
+vtdbg_dev_create(void)
+{
+ vtdbg_dev = make_dev(&vtdbg_cdevsw, 0, UID_ROOT, GID_OPERATOR,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "vtdbg");
+ if (vtdbg_dev == NULL)
+ return (ENOMEM);
+
+ return (0);
+}
+
+static void
+vtdbg_dev_destroy(void)
+{
+ MPASS(vtdbg_dev != NULL);
+ destroy_dev(vtdbg_dev);
+}
+
+static int
+vtdbg_loader(struct module *m, int what, void *arg)
+{
+ int err = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ err = vtdbg_dev_create();
+ break;
+ case MOD_UNLOAD:
+ vtdbg_dev_destroy();
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (err);
+}
+
+static moduledata_t vtdbg_moddata = {
+ "vtdbg",
+ vtdbg_loader,
+ NULL,
+};
+
+DECLARE_MODULE(vtdbg, vtdbg_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE);
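
[Editorial sketch, not part of the patch: the userspace half of the note()/ack handshake described in vtdbg_ack() above. The emulator waits for EVFILT_READ on the control fd, reads the register offset from kn_data, emulates the access against the mmap'd control region, and issues VIRTIO_DBG_ACK so the busy-waiting kernel thread can continue. emulate_register() is a placeholder, and fd/shared are assumed to come from the setup sketch shown after virtio_dbg.h above.]

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <stddef.h>
#include "virtio_dbg.h"

/* Placeholder: apply the register access found at 'offset' in the shared region. */
void emulate_register(void *shared, size_t offset);

static void
vtdbg_note_loop(int fd, void *shared)
{
	struct kevent ev;
	int kq;

	kq = kqueue();
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
			continue;
		/* ev.data carries the MMIO register offset the kernel wrote. */
		emulate_register(shared, (size_t)ev.data);
		/* Clear vtd_offset and release the spinning kernel thread. */
		ioctl(fd, VIRTIO_DBG_ACK);
	}
}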
diff --git a/sys/dev/virtio/mmio/virtio_mmio.h b/sys/dev/virtio/mmio/virtio_mmio.h
--- a/sys/dev/virtio/mmio/virtio_mmio.h
+++ b/sys/dev/virtio/mmio/virtio_mmio.h
@@ -31,9 +31,12 @@
#ifndef _VIRTIO_MMIO_H
#define _VIRTIO_MMIO_H
+#ifdef _KERNEL
+
DECLARE_CLASS(vtmmio_driver);
struct vtmmio_virtqueue;
+typedef void *vtmmio_alloc_cb_t(device_t, size_t);
struct vtmmio_softc {
device_t dev;
@@ -51,10 +54,25 @@
int vtmmio_nvqs;
struct vtmmio_virtqueue *vtmmio_vqs;
void *ih;
+
+ vtmmio_alloc_cb_t *vtmmio_ringalloc_cb;
};
int vtmmio_probe(device_t);
int vtmmio_attach(device_t);
+void vtmmio_reset(struct vtmmio_softc *);
+uint8_t vtmmio_get_status(device_t);
+void vtmmio_set_status(device_t, uint8_t);
+void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
+
+#define vtmmio_read_config_1(sc, o) \
+ bus_read_1((sc)->res[0], (o))
+#define vtmmio_read_config_2(sc, o) \
+ bus_read_2((sc)->res[0], (o))
+#define vtmmio_read_config_4(sc, o) \
+ bus_read_4((sc)->res[0], (o))
+
+#endif /* _KERNEL */
#define VIRTIO_MMIO_MAGIC_VALUE 0x000
#define VIRTIO_MMIO_VERSION 0x004
diff --git a/sys/dev/virtio/mmio/virtio_mmio.c b/sys/dev/virtio/mmio/virtio_mmio.c
--- a/sys/dev/virtio/mmio/virtio_mmio.c
+++ b/sys/dev/virtio/mmio/virtio_mmio.c
@@ -84,19 +84,15 @@
static void vtmmio_reinit_complete(device_t);
static void vtmmio_notify_virtqueue(device_t, uint16_t, bus_size_t);
static int vtmmio_config_generation(device_t);
-static uint8_t vtmmio_get_status(device_t);
-static void vtmmio_set_status(device_t, uint8_t);
static void vtmmio_read_dev_config(device_t, bus_size_t, void *, int);
static uint64_t vtmmio_read_dev_config_8(struct vtmmio_softc *, bus_size_t);
static void vtmmio_write_dev_config(device_t, bus_size_t, const void *, int);
static void vtmmio_describe_features(struct vtmmio_softc *, const char *,
uint64_t);
-static void vtmmio_probe_and_attach_child(struct vtmmio_softc *);
static int vtmmio_reinit_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_free_interrupts(struct vtmmio_softc *);
static void vtmmio_free_virtqueues(struct vtmmio_softc *);
static void vtmmio_release_child_resources(struct vtmmio_softc *);
-static void vtmmio_reset(struct vtmmio_softc *);
static void vtmmio_select_virtqueue(struct vtmmio_softc *, int);
static void vtmmio_vq_intr(void *);
@@ -128,13 +124,6 @@
VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \
} while (0)
-#define vtmmio_read_config_1(sc, o) \
- bus_read_1((sc)->res[0], (o))
-#define vtmmio_read_config_2(sc, o) \
- bus_read_2((sc)->res[0], (o))
-#define vtmmio_read_config_4(sc, o) \
- bus_read_4((sc)->res[0], (o))
-
static device_method_t vtmmio_methods[] = {
/* Device interface. */
DEVMETHOD(device_attach, vtmmio_attach),
@@ -572,7 +561,7 @@
error = virtqueue_alloc(dev, idx, size,
VIRTIO_MMIO_QUEUE_NOTIFY, VIRTIO_MMIO_VRING_ALIGN,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, sc->vtmmio_ringalloc_cb);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n",
@@ -689,7 +678,7 @@
return (gen);
}
-static uint8_t
+uint8_t
vtmmio_get_status(device_t dev)
{
struct vtmmio_softc *sc;
@@ -699,7 +688,7 @@
return (vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS));
}
-static void
+void
vtmmio_set_status(device_t dev, uint8_t status)
{
struct vtmmio_softc *sc;
@@ -875,7 +864,7 @@
virtio_describe(dev, msg, features, sc->vtmmio_child_feat_desc);
}
-static void
+void
vtmmio_probe_and_attach_child(struct vtmmio_softc *sc)
{
device_t dev, child;
@@ -976,7 +965,7 @@
vtmmio_free_virtqueues(sc);
}
-static void
+void
vtmmio_reset(struct vtmmio_softc *sc)
{
diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c
--- a/sys/dev/virtio/pci/virtio_pci.c
+++ b/sys/dev/virtio/pci/virtio_pci.c
@@ -362,7 +362,7 @@
notify_offset = vtpci_get_vq_notify_off(cn, idx);
error = virtqueue_alloc(dev, idx, size, notify_offset, align,
- ~(vm_paddr_t)0, info, &vq);
+ ~(vm_paddr_t)0, info, &vq, NULL);
if (error) {
device_printf(dev,
"cannot allocate virtqueue %d: %d\n", idx, error);
diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h
--- a/sys/dev/virtio/virtio.h
+++ b/sys/dev/virtio/virtio.h
@@ -68,7 +68,9 @@
#define VIRTIO_DRIVER_MODULE(name, driver, evh, arg) \
DRIVER_MODULE(name, virtio_mmio, driver, evh, arg); \
- DRIVER_MODULE(name, virtio_pci, driver, evh, arg)
+ DRIVER_MODULE(name, virtio_pci, driver, evh, arg); \
+ DRIVER_MODULE(name, virtio_dbg, driver, evh, arg)
+
struct virtio_pnp_match {
uint32_t device_type;
@@ -82,6 +84,8 @@
MODULE_PNP_INFO("U32:device_type;D:#", virtio_mmio, driver, \
&driver ## _match, 1); \
MODULE_PNP_INFO("U32:device_type;D:#", virtio_pci, driver, \
+ &driver ## _match, 1); \
+ MODULE_PNP_INFO("U32:device_type;D:#", virtio_dbg, driver, \
&driver ## _match, 1)
#define VIRTIO_SIMPLE_PROBE(dev, driver) \
(virtio_simple_probe(dev, &driver ## _match))
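
[Editorial note: with this change, a virtio device driver that declares itself through these macros is registered on the new virtio_dbg bus as well as virtio_mmio and virtio_pci. A schematic use, with hypothetical names:]

/* foo, foo_driver and foo_modevent are placeholder names, not part of this patch. */
VIRTIO_DRIVER_MODULE(foo, foo_driver, foo_modevent, NULL);
/* ...expands to DRIVER_MODULE() entries for virtio_mmio, virtio_pci and virtio_dbg. */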
diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h
--- a/sys/dev/virtio/virtqueue.h
+++ b/sys/dev/virtio/virtqueue.h
@@ -34,6 +34,7 @@
/* Device callback for a virtqueue interrupt. */
typedef void virtqueue_intr_t(void *);
+typedef void *virtqueue_alloc_cb_t(device_t, size_t);
/*
* Hint on how long the next interrupt should be postponed. This is
@@ -67,7 +68,8 @@
int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp);
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t *cb);
void *virtqueue_drain(struct virtqueue *vq, int *last);
void virtqueue_free(struct virtqueue *vq);
int virtqueue_reinit(struct virtqueue *vq, uint16_t size);
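
[Editorial sketch, not part of the patch: a minimal callback matching the new virtqueue_alloc_cb_t hook, as it would appear inside a transport driver. The function name is illustrative; the contigmalloc() body simply restates the default behaviour a callback must provide - zeroed, physically contiguous, page-aligned ring memory.]

static void *
example_ringalloc(device_t dev, size_t size)
{
	/* Must return zeroed, physically contiguous, page-aligned memory, or NULL. */
	return (contigmalloc(size, M_DEVBUF, M_NOWAIT | M_ZERO, 0,
	    ~(vm_paddr_t)0, PAGE_SIZE, 0));
}

[A transport stores such a function in its softc (as the MMIO code now does with vtmmio_ringalloc_cb), and virtqueue_alloc() calls it instead of allocating the ring itself when the pointer is non-NULL.]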
diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c
--- a/sys/dev/virtio/virtqueue.c
+++ b/sys/dev/virtio/virtqueue.c
@@ -151,7 +151,8 @@
int
virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
bus_size_t notify_offset, int align, vm_paddr_t highaddr,
- struct vq_alloc_info *info, struct virtqueue **vqp)
+ struct vq_alloc_info *info, struct virtqueue **vqp,
+ virtqueue_alloc_cb_t alloc_cb)
{
struct virtqueue *vq;
int error;
@@ -206,8 +207,12 @@
}
vq->vq_ring_size = round_page(vring_size(size, align));
- vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
- M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ if (alloc_cb != NULL) {
+ vq->vq_ring_mem = alloc_cb(dev, vq->vq_ring_size);
+ } else {
+ vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+ M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ }
if (vq->vq_ring_mem == NULL) {
device_printf(dev,
"cannot allocate memory for virtqueue ring\n");
diff --git a/tests/sys/Makefile b/tests/sys/Makefile
--- a/tests/sys/Makefile
+++ b/tests/sys/Makefile
@@ -33,6 +33,7 @@
TESTS_SUBDIRS+= ses
TESTS_SUBDIRS+= sys
TESTS_SUBDIRS+= vfs
+TESTS_SUBDIRS+= virtio
TESTS_SUBDIRS+= vm
TESTS_SUBDIRS+= vmm
diff --git a/tests/sys/virtio/Makefile b/tests/sys/virtio/Makefile
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/Makefile
@@ -0,0 +1,28 @@
+PROG= virtiodbg
+
+.PATH: ${SRCTOP}/sys/libkern
+
+SRCS= block_if.c \
+ config.c \
+ iov.c \
+ iov_emul.c \
+ mevent.c \
+ mmio_virtio_block.c \
+ mmio_emul.c \
+ virtio.c \
+ virtiodbg.c
+
+MAN=
+
+CFLAGS+=-I${.CURDIR} \
+ -I${SRCTOP}/sys
+
+LIBADD+= md nv pthread
+
+# Disable thread safety analysis since it only finds very simple bugs and
+# yields many false positives.
+NO_WTHREAD_SAFETY=
+
+NO_WCAST_ALIGN=
+
+.include <bsd.prog.mk>
diff --git a/tests/sys/virtio/block_if.h b/tests/sys/virtio/block_if.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.h
@@ -0,0 +1,84 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The block API to be used by bhyve block-device emulations. The routines
+ * are thread safe, with no assumptions about the context of the completion
+ * callback - it may occur in the caller's context, or asynchronously in
+ * another thread.
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+#include <sys/nv.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+/*
+ * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
+ * a single request. BLOCKIF_RING_MAX is the maximum number of
+ * pending requests that can be queued.
+ */
+#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */
+#define BLOCKIF_RING_MAX 128
+
+struct blockif_req {
+ int br_iovcnt;
+ off_t br_offset;
+ ssize_t br_resid;
+ void (*br_callback)(struct blockif_req *req, int err);
+ void *br_param;
+ struct iovec br_iov[BLOCKIF_IOV_MAX];
+};
+
+struct mmio_devinst;
+struct blockif_ctxt;
+
+typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t, uint64_t);
+
+int blockif_legacy_config(nvlist_t *nvl, const char *opts);
+struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident);
+int blockif_register_resize_callback(struct blockif_ctxt *bc,
+ blockif_resize_cb *cb, void *cb_arg);
+off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
+int blockif_sectsz(struct blockif_ctxt *bc);
+void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
+int blockif_queuesz(struct blockif_ctxt *bc);
+int blockif_is_ro(struct blockif_ctxt *bc);
+int blockif_candelete(struct blockif_ctxt *bc);
+int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
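
[Editorial sketch, not part of the patch: submitting I/O through the API above from a device emulation. example_done() and example_read() are placeholder names; the request must stay valid until the completion callback runs, possibly on a blockif worker thread.]

#include "block_if.h"

static void
example_done(struct blockif_req *br, int err)
{
	/* Called once the I/O completes; may run on a blockif worker thread. */
}

static int
example_read(struct blockif_ctxt *bc, void *buf, size_t len, off_t off)
{
	static struct blockif_req br;	/* must remain valid until example_done() */

	br.br_iov[0].iov_base = buf;
	br.br_iov[0].iov_len = len;
	br.br_iovcnt = 1;
	br.br_offset = off;
	br.br_resid = len;
	br.br_callback = example_done;
	br.br_param = NULL;
	return (blockif_read(bc, &br));	/* E2BIG if the queue limit is exceeded */
}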
diff --git a/tests/sys/virtio/block_if.c b/tests/sys/virtio/block_if.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/block_if.c
@@ -0,0 +1,980 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <machine/atomic.h>
+#include <machine/vmm_snapshot.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "block_if.h"
+
+#define BLOCKIF_SIG 0xb109b109
+
+#define BLOCKIF_NUMTHR 8
+#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
+
+enum blockop {
+ BOP_READ,
+ BOP_WRITE,
+ BOP_FLUSH,
+ BOP_DELETE
+};
+
+enum blockstat {
+ BST_FREE,
+ BST_BLOCK,
+ BST_PEND,
+ BST_BUSY,
+ BST_DONE
+};
+
+struct blockif_elem {
+ TAILQ_ENTRY(blockif_elem) be_link;
+ struct blockif_req *be_req;
+ enum blockop be_op;
+ enum blockstat be_status;
+ pthread_t be_tid;
+ off_t be_block;
+};
+
+struct blockif_ctxt {
+ unsigned int bc_magic;
+ int bc_fd;
+ int bc_ischr;
+ int bc_isgeom;
+ int bc_candelete;
+ int bc_rdonly;
+ off_t bc_size;
+ int bc_sectsz;
+ int bc_psectsz;
+ int bc_psectoff;
+ int bc_closing;
+ int bc_paused;
+ pthread_t bc_btid[BLOCKIF_NUMTHR];
+ pthread_mutex_t bc_mtx;
+ pthread_cond_t bc_cond;
+ pthread_cond_t bc_work_done_cond;
+ blockif_resize_cb *bc_resize_cb;
+ void *bc_resize_cb_arg;
+ struct mevent *bc_resize_event;
+
+ /* Request elements and free/pending/busy queues */
+ TAILQ_HEAD(, blockif_elem) bc_freeq;
+ TAILQ_HEAD(, blockif_elem) bc_pendq;
+ TAILQ_HEAD(, blockif_elem) bc_busyq;
+ struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
+ int bc_bootindex;
+};
+
+static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
+
+struct blockif_sig_elem {
+ pthread_mutex_t bse_mtx;
+ pthread_cond_t bse_cond;
+ int bse_pending;
+ struct blockif_sig_elem *bse_next;
+};
+
+static struct blockif_sig_elem *blockif_bse_head;
+
+static int
+blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ struct blockif_elem *be, *tbe;
+ off_t off;
+ int i;
+
+ be = TAILQ_FIRST(&bc->bc_freeq);
+ assert(be != NULL);
+ assert(be->be_status == BST_FREE);
+ TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
+ be->be_req = breq;
+ be->be_op = op;
+ switch (op) {
+ case BOP_READ:
+ case BOP_WRITE:
+ case BOP_DELETE:
+ off = breq->br_offset;
+ for (i = 0; i < breq->br_iovcnt; i++)
+ off += breq->br_iov[i].iov_len;
+ break;
+ default:
+ off = OFF_MAX;
+ }
+ be->be_block = off;
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ if (tbe == NULL) {
+ TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
+ if (tbe->be_block == breq->br_offset)
+ break;
+ }
+ }
+ if (tbe == NULL)
+ be->be_status = BST_PEND;
+ else
+ be->be_status = BST_BLOCK;
+ TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
+ return (be->be_status == BST_PEND);
+}
+
+static int
+blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
+{
+ struct blockif_elem *be;
+
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_status == BST_PEND)
+ break;
+ assert(be->be_status == BST_BLOCK);
+ }
+ if (be == NULL)
+ return (0);
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ be->be_status = BST_BUSY;
+ be->be_tid = t;
+ TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
+ *bep = be;
+ return (1);
+}
+
+static void
+blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+ struct blockif_elem *tbe;
+
+ if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
+ TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
+ else
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
+ if (tbe->be_req->br_offset == be->be_block)
+ tbe->be_status = BST_PEND;
+ }
+ be->be_tid = 0;
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+}
+
+static int
+blockif_flush_bc(struct blockif_ctxt *bc)
+{
+ if (bc->bc_ischr) {
+ if (ioctl(bc->bc_fd, DIOCGFLUSH))
+ return (errno);
+ } else if (fsync(bc->bc_fd))
+ return (errno);
+
+ return (0);
+}
+
+static void
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
+{
+ struct spacectl_range range;
+ struct blockif_req *br;
+ off_t arg[2];
+ ssize_t n;
+ size_t clen, len, off, boff, voff;
+ int i, err;
+
+ br = be->be_req;
+ assert(br->br_resid >= 0);
+
+ if (br->br_iovcnt <= 1)
+ buf = NULL;
+ err = 0;
+ switch (be->be_op) {
+ case BOP_READ:
+ if (buf == NULL) {
+ if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ n = pread(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ len = (size_t)n;
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
+ buf + boff, clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+ off += len;
+ br->br_resid -= len;
+ }
+ break;
+ case BOP_WRITE:
+ if (bc->bc_rdonly) {
+ err = EROFS;
+ break;
+ }
+ if (buf == NULL) {
+ if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset)) < 0)
+ err = errno;
+ else
+ br->br_resid -= n;
+ break;
+ }
+ i = 0;
+ off = voff = 0;
+ while (br->br_resid > 0) {
+ len = MIN(br->br_resid, MAXPHYS);
+ boff = 0;
+ do {
+ clen = MIN(len - boff, br->br_iov[i].iov_len -
+ voff);
+ memcpy(buf + boff,
+ (uint8_t *)br->br_iov[i].iov_base + voff,
+ clen);
+ if (clen < br->br_iov[i].iov_len - voff)
+ voff += clen;
+ else {
+ i++;
+ voff = 0;
+ }
+ boff += clen;
+ } while (boff < len);
+
+ n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
+ if (n < 0) {
+ err = errno;
+ break;
+ }
+ off += n;
+ br->br_resid -= n;
+ }
+ break;
+ case BOP_FLUSH:
+ err = blockif_flush_bc(bc);
+ break;
+ case BOP_DELETE:
+ if (!bc->bc_candelete)
+ err = EOPNOTSUPP;
+ else if (bc->bc_rdonly)
+ err = EROFS;
+ else if (bc->bc_ischr) {
+ arg[0] = br->br_offset;
+ arg[1] = br->br_resid;
+ if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
+ err = errno;
+ else
+ br->br_resid = 0;
+ } else {
+ range.r_offset = br->br_offset;
+ range.r_len = br->br_resid;
+
+ while (range.r_len > 0) {
+ if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
+ &range, 0, &range) != 0) {
+ err = errno;
+ break;
+ }
+ }
+ if (err == 0)
+ br->br_resid = 0;
+ }
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ be->be_status = BST_DONE;
+
+ (*br->br_callback)(br, err);
+}
+
+static inline bool
+blockif_empty(const struct blockif_ctxt *bc)
+{
+ return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
+}
+
+static void *
+blockif_thr(void *arg)
+{
+ struct blockif_ctxt *bc;
+ struct blockif_elem *be;
+ pthread_t t;
+ uint8_t *buf;
+
+ bc = arg;
+ if (bc->bc_isgeom)
+ buf = malloc(MAXPHYS);
+ else
+ buf = NULL;
+ t = pthread_self();
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ for (;;) {
+ while (blockif_dequeue(bc, t, &be)) {
+ pthread_mutex_unlock(&bc->bc_mtx);
+ blockif_proc(bc, be, buf);
+ pthread_mutex_lock(&bc->bc_mtx);
+ blockif_complete(bc, be);
+ }
+
+ /* If there is no work left, notify the main thread */
+ if (blockif_empty(bc))
+ pthread_cond_broadcast(&bc->bc_work_done_cond);
+
+ /* Check ctxt status here to see if exit requested */
+ if (bc->bc_closing)
+ break;
+
+ pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ if (buf)
+ free(buf);
+ pthread_exit(NULL);
+ return (NULL);
+}
+
+static void
+blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
+ void *arg __unused, uint64_t data __unused)
+{
+ struct blockif_sig_elem *bse;
+
+ for (;;) {
+ /*
+ * Process the entire list even if not intended for
+ * this thread.
+ */
+ do {
+ bse = blockif_bse_head;
+ if (bse == NULL)
+ return;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)bse,
+ (uintptr_t)bse->bse_next));
+
+ pthread_mutex_lock(&bse->bse_mtx);
+ bse->bse_pending = 0;
+ pthread_cond_signal(&bse->bse_cond);
+ pthread_mutex_unlock(&bse->bse_mtx);
+ }
+}
+
+static void
+blockif_init(void)
+{
+ mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+ (void) signal(SIGCONT, SIG_IGN);
+}
+
+
+struct blockif_ctxt *
+blockif_open(nvlist_t *nvl, const char *ident)
+{
+ char tname[MAXCOMLEN + 1];
+ char name[MAXPATHLEN];
+ const char *path, *pssval, *ssval, *bootindex_val;
+ char *cp;
+ struct blockif_ctxt *bc;
+ struct stat sbuf;
+ struct diocgattr_arg arg;
+ off_t size, psectsz, psectoff;
+ int extra, fd, i, sectsz;
+ int ro, candelete, geom, ssopt, pssopt;
+ int nodelete;
+ int bootindex;
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+ cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
+#endif
+
+ pthread_once(&blockif_once, blockif_init);
+
+ fd = -1;
+ extra = 0;
+ ssopt = 0;
+ ro = 0;
+ nodelete = 0;
+ bootindex = -1;
+
+ if (get_config_bool_node_default(nvl, "nocache", false))
+ extra |= O_DIRECT;
+ if (get_config_bool_node_default(nvl, "nodelete", false))
+ nodelete = 1;
+ if (get_config_bool_node_default(nvl, "sync", false) ||
+ get_config_bool_node_default(nvl, "direct", false))
+ extra |= O_SYNC;
+ if (get_config_bool_node_default(nvl, "ro", false))
+ ro = 1;
+ ssval = get_config_value_node(nvl, "sectorsize");
+ if (ssval != NULL) {
+ ssopt = strtol(ssval, &cp, 10);
+ if (cp == ssval) {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ if (*cp == '\0') {
+ pssopt = ssopt;
+ } else if (*cp == '/') {
+ pssval = cp + 1;
+ pssopt = strtol(pssval, &cp, 10);
+ if (cp == pssval || *cp != '\0') {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ } else {
+ EPRINTLN("Invalid sector size \"%s\"", ssval);
+ goto err;
+ }
+ }
+
+ bootindex_val = get_config_value_node(nvl, "bootindex");
+ if (bootindex_val != NULL) {
+ bootindex = atoi(bootindex_val);
+ }
+
+ path = get_config_value_node(nvl, "path");
+ if (path == NULL) {
+ EPRINTLN("Missing \"path\" for block device.");
+ goto err;
+ }
+
+ fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
+ if (fd < 0 && !ro) {
+ /* The r/w open failed; retry with a r/o open */
+ fd = open(path, O_RDONLY | extra);
+ ro = 1;
+ }
+
+ if (fd < 0) {
+ warn("Could not open backing file: %s", path);
+ goto err;
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ warn("Could not stat backing file %s", path);
+ goto err;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
+ CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
+ if (ro)
+ cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
+
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ psectsz = psectoff = 0;
+ candelete = geom = 0;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ goto err;
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
+ ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
+ strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+ arg.len = sizeof(arg.value.i);
+ if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
+ candelete = arg.value.i;
+ if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+ geom = 1;
+ } else {
+ psectsz = sbuf.st_blksize;
+ /* Avoid fallback implementation */
+ candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ if (ssopt != 0) {
+ if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+ ssopt > pssopt) {
+ EPRINTLN("Invalid sector size %d/%d",
+ ssopt, pssopt);
+ goto err;
+ }
+
+ /*
+ * Some backend drivers (e.g. cd0, ada0) require that the I/O
+ * size be a multiple of the device's sector size.
+ *
+ * Validate that the emulated sector size complies with this
+ * requirement.
+ */
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+ EPRINTLN("Sector size %d incompatible "
+ "with underlying device sector size %d",
+ ssopt, sectsz);
+ goto err;
+ }
+ }
+
+ sectsz = ssopt;
+ psectsz = pssopt;
+ psectoff = 0;
+ }
+
+ bc = calloc(1, sizeof(struct blockif_ctxt));
+ if (bc == NULL) {
+ perror("calloc");
+ goto err;
+ }
+
+ bc->bc_magic = BLOCKIF_SIG;
+ bc->bc_fd = fd;
+ bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+ bc->bc_isgeom = geom;
+ bc->bc_candelete = candelete;
+ bc->bc_rdonly = ro;
+ bc->bc_size = size;
+ bc->bc_sectsz = sectsz;
+ bc->bc_psectsz = psectsz;
+ bc->bc_psectoff = psectoff;
+ pthread_mutex_init(&bc->bc_mtx, NULL);
+ pthread_cond_init(&bc->bc_cond, NULL);
+ bc->bc_paused = 0;
+ pthread_cond_init(&bc->bc_work_done_cond, NULL);
+ TAILQ_INIT(&bc->bc_freeq);
+ TAILQ_INIT(&bc->bc_pendq);
+ TAILQ_INIT(&bc->bc_busyq);
+ bc->bc_bootindex = bootindex;
+ for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+ bc->bc_reqs[i].be_status = BST_FREE;
+ TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+ }
+
+ for (i = 0; i < BLOCKIF_NUMTHR; i++) {
+ pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
+ snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
+ pthread_set_name_np(bc->bc_btid[i], tname);
+ }
+
+ return (bc);
+err:
+ if (fd >= 0)
+ close(fd);
+ return (NULL);
+}
+
+static void
+blockif_resized(int fd, enum ev_type type __unused, void *arg,
+ uint64_t data __unused)
+{
+ struct blockif_ctxt *bc;
+ struct stat sb;
+ off_t mediasize;
+
+ if (fstat(fd, &sb) != 0)
+ return;
+
+ if (S_ISCHR(sb.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
+ EPRINTLN("blockif_resized: get mediasize failed: %s",
+ strerror(errno));
+ return;
+ }
+ } else
+ mediasize = sb.st_size;
+
+ bc = arg;
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (mediasize != bc->bc_size) {
+ bc->bc_size = mediasize;
+ bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size, 0);
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+}
+
+int
+blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
+ void *cb_arg)
+{
+ struct stat sb;
+ int err;
+
+ if (cb == NULL)
+ return (EINVAL);
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (bc->bc_resize_cb != NULL) {
+ err = EBUSY;
+ goto out;
+ }
+
+ assert(bc->bc_closing == 0);
+
+ if (fstat(bc->bc_fd, &sb) != 0) {
+ err = errno;
+ goto out;
+ }
+
+ bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
+ EVFF_ATTRIB, blockif_resized, bc);
+ if (bc->bc_resize_event == NULL) {
+ err = ENXIO;
+ goto out;
+ }
+
+ bc->bc_resize_cb = cb;
+ bc->bc_resize_cb_arg = cb_arg;
+out:
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ int err;
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ assert(!bc->bc_paused);
+ if (!TAILQ_EMPTY(&bc->bc_freeq)) {
+ /*
+ * Enqueue and inform the block i/o thread
+ * that there is work available
+ */
+ if (blockif_enqueue(bc, breq, op))
+ pthread_cond_signal(&bc->bc_cond);
+ } else {
+ /*
+ * Callers are not allowed to enqueue more than
+ * the specified blockif queue limit. Return an
+ * error to indicate that the queue length has been
+ * exceeded.
+ */
+ err = E2BIG;
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_READ));
+}
+
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+int
+blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_DELETE));
+}
+
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ /* XXX: not waiting while paused */
+
+ /*
+ * Check pending requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be != NULL) {
+ /*
+ * Found it.
+ */
+ blockif_complete(bc, be);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
+ }
+
+ /*
+ * Check in-flight requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ /*
+ * Didn't find it.
+ */
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ /*
+ * Interrupt the processing thread to force it to return
+ * prematurely via its normal callback path.
+ */
+ while (be->be_status == BST_BUSY) {
+ struct blockif_sig_elem bse, *old_head;
+
+ pthread_mutex_init(&bse.bse_mtx, NULL);
+ pthread_cond_init(&bse.bse_cond, NULL);
+
+ bse.bse_pending = 1;
+
+ do {
+ old_head = blockif_bse_head;
+ bse.bse_next = old_head;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)old_head,
+ (uintptr_t)&bse));
+
+ pthread_kill(be->be_tid, SIGCONT);
+
+ pthread_mutex_lock(&bse.bse_mtx);
+ while (bse.bse_pending)
+ pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+ pthread_mutex_unlock(&bse.bse_mtx);
+ }
+
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * The processing thread has been interrupted. Since it's not
+ * clear if the callback has been invoked yet, return EBUSY.
+ */
+ return (EBUSY);
+}
+
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+ void *jval;
+ int i;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ /*
+ * Stop the block i/o thread
+ */
+ pthread_mutex_lock(&bc->bc_mtx);
+ bc->bc_closing = 1;
+ if (bc->bc_resize_event != NULL)
+ mevent_disable(bc->bc_resize_event);
+ pthread_mutex_unlock(&bc->bc_mtx);
+ pthread_cond_broadcast(&bc->bc_cond);
+ for (i = 0; i < BLOCKIF_NUMTHR; i++)
+ pthread_join(bc->bc_btid[i], &jval);
+
+ /* XXX Cancel queued i/o's ??? */
+
+ /*
+ * Release resources
+ */
+ bc->bc_magic = 0;
+ close(bc->bc_fd);
+ free(bc);
+
+ return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535L * 16 * 255)
+ sectors = 65535L * 16 * 255;
+
+ if (sectors >= 65536L * 16 * 63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
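+
+/*
+ * Worked example (illustrative arithmetic only): a 1 GiB backing store
+ * with 512-byte sectors holds 2097152 sectors, below the 65536*16*63
+ * threshold, so the adjustments above settle on 63 sectors per track
+ * and 16 heads (2097152 / 63 = 33288 head-cylinders), giving a
+ * reported geometry of C/H/S = 2080/16/63.
+ */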
+
+/*
+ * Accessors
+ */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_size);
+}
+
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_sectsz);
+}
+
+void
+blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ *size = bc->bc_psectsz;
+ *off = bc->bc_psectoff;
+}
+
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (BLOCKIF_MAXREQ - 1);
+}
+
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_rdonly);
+}
+
+int
+blockif_candelete(struct blockif_ctxt *bc)
+{
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_candelete);
+}
diff --git a/tests/sys/virtio/config.h b/tests/sys/virtio/config.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.h
@@ -0,0 +1,129 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#include <sys/nv.h>
+
+/*-
+ * Manages a configuration database backed by an nv(9) list.
+ *
+ * The database only stores string values. Callers should parse
+ * values into other types if needed. String values can reference
+ * other configuration variables using a '%(name)' syntax. In this
+ * case, the name must be the full path of the configuration
+ * variable. The % character can be escaped with a preceding \ to
+ * avoid expansion. Any \ characters must be escaped.
+ *
+ * Configuration variables are stored in a tree. The full path of a
+ * variable is specified as a dot-separated name similar to sysctl(8)
+ * OIDs.
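+ *
+ * For example (illustrative values only), after
+ *
+ *     set_config_value("block0.path", "/tmp/disk.img");
+ *     set_config_value("block0.backup", "%(block0.path).bak");
+ *
+ * get_config_value("block0.backup") expands the reference and returns
+ * "/tmp/disk.img.bak".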
+ */
+
+/*
+ * Fetches the value of a configuration variable. If the "raw" value
+ * contains references to other configuration variables, this function
+ * expands those references and returns a pointer to the parsed
+ * string. The string's storage is only stable until the next call to
+ * this function.
+ *
+ * If no node is found, returns NULL.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+const char *get_config_value_node(const nvlist_t *parent, const char *name);
+
+/*
+ * Similar to get_config_value_node but expects a full path to the
+ * leaf node.
+ */
+const char *get_config_value(const char *path);
+
+/* Initializes the tree to an empty state. */
+void init_config(void);
+
+/*
+ * Creates a configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable. If the node already exists, this returns a pointer to
+ * the existing node.
+ */
+nvlist_t *create_config_node(const char *path);
+
+/*
+ * Looks for an existing configuration node via a dot-separated OID
+ * path. Will fail if the path names an existing leaf configuration
+ * variable.
+ */
+nvlist_t *find_config_node(const char *path);
+
+/*
+ * Similar to the above, but treats the path relative to an existing
+ * 'parent' node rather than as an absolute path.
+ */
+nvlist_t *create_relative_config_node(nvlist_t *parent, const char *path);
+nvlist_t *find_relative_config_node(nvlist_t *parent, const char *path);
+
+/*
+ * Adds or replaces the value of the specified variable.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+void set_config_value_node(nvlist_t *parent, const char *name,
+ const char *value);
+
+/*
+ * Similar to set_config_value_node but only sets the value if it is
+ * not already set.
+ */
+void set_config_value_node_if_unset(nvlist_t *const parent,
+ const char *const name, const char *const value);
+
+/*
+ * Similar to set_config_value_node but expects a full path to the
+ * leaf node.
+ */
+void set_config_value(const char *path, const char *value);
+
+/*
+ * Similar to set_config_value but only sets the value if it is not
+ * already set.
+ */
+void set_config_value_if_unset(const char *const path,
+ const char *const value);
+
+/* Convenience wrappers for boolean variables. */
+bool get_config_bool(const char *path);
+bool get_config_bool_node(const nvlist_t *parent, const char *name);
+bool get_config_bool_default(const char *path, bool def);
+bool get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def);
+void set_config_bool(const char *path, bool value);
+void set_config_bool_node(nvlist_t *parent, const char *name, bool value);
+
+void dump_config(void);
+
+#endif /* !__CONFIG_H__ */
diff --git a/tests/sys/virtio/config.c b/tests/sys/virtio/config.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/config.c
@@ -0,0 +1,464 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+
+static nvlist_t *config_root;
+
+void
+init_config(void)
+{
+
+ config_root = nvlist_create(0);
+ if (config_root == NULL)
+ err(4, "Failed to create configuration root nvlist");
+}
+
+static nvlist_t *
+_lookup_config_node(nvlist_t *parent, const char *path, bool create)
+{
+ char *copy, *name, *tofree;
+ nvlist_t *nvl, *new_nvl;
+
+ copy = strdup(path);
+ if (copy == NULL)
+ errx(4, "Failed to allocate memory");
+ tofree = copy;
+ nvl = parent;
+ while ((name = strsep(&copy, ".")) != NULL) {
+ if (*name == '\0') {
+ warnx("Invalid configuration node: %s", path);
+ nvl = NULL;
+ break;
+ }
+ if (nvlist_exists_nvlist(nvl, name))
+ /*
+ * XXX-MJ it is incorrect to cast away the const
+ * qualifier like this since the contract with nvlist
+ * says that values are immutable, and some consumers
+ * will indeed add nodes to the returned nvlist. In
+ * practice, however, it appears to be harmless with the
+ * current nvlist implementation, so we just live with
+ * it until the implementation is reworked.
+ */
+ nvl = __DECONST(nvlist_t *,
+ nvlist_get_nvlist(nvl, name));
+ else if (nvlist_exists(nvl, name)) {
+ for (copy = tofree; copy < name; copy++)
+ if (*copy == '\0')
+ *copy = '.';
+ warnx(
+ "Configuration node %s is a child of existing variable %s",
+ path, tofree);
+ nvl = NULL;
+ break;
+ } else if (create) {
+ /*
+ * XXX-MJ as with the case above, "new_nvl" shouldn't be
+ * mutated after its ownership is given to "nvl".
+ */
+ new_nvl = nvlist_create(0);
+ if (new_nvl == NULL)
+ errx(4, "Failed to allocate memory");
+ nvlist_move_nvlist(nvl, name, new_nvl);
+ nvl = new_nvl;
+ } else {
+ nvl = NULL;
+ break;
+ }
+ }
+ free(tofree);
+ return (nvl);
+}
+
+nvlist_t *
+create_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, true));
+}
+
+nvlist_t *
+find_config_node(const char *path)
+{
+
+ return (_lookup_config_node(config_root, path, false));
+}
+
+nvlist_t *
+create_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, true));
+}
+
+nvlist_t *
+find_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+ return (_lookup_config_node(parent, path, false));
+}
+
+void
+set_config_value_node(nvlist_t *parent, const char *name, const char *value)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_string(parent, name))
+ nvlist_free_string(parent, name);
+ else if (nvlist_exists(parent, name))
+ errx(4,
+ "Attempting to add value %s to existing node %s of list %p",
+ value, name, parent);
+ nvlist_add_string(parent, name, value);
+}
+
+void
+set_config_value_node_if_unset(nvlist_t *const parent, const char *const name,
+ const char *const value)
+{
+ if (get_config_value_node(parent, name) != NULL) {
+ return;
+ }
+
+ set_config_value_node(parent, name, value);
+}
+
+void
+set_config_value(const char *path, const char *value)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = create_config_node(node_name);
+ if (nvl == NULL)
+ errx(4, "Failed to create configuration node %s",
+ node_name);
+ free(node_name);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_nvlist(nvl, name))
+ errx(4, "Attempting to add value %s to existing node %s",
+ value, path);
+ set_config_value_node(nvl, name, value);
+}
+
+void
+set_config_value_if_unset(const char *const path, const char *const value)
+{
+ if (get_config_value(path) != NULL) {
+ return;
+ }
+
+ set_config_value(path, value);
+}
+
+static const char *
+get_raw_config_value(const char *path)
+{
+ const char *name;
+ char *node_name;
+ nvlist_t *nvl;
+
+ /* Look for last separator. */
+ name = strrchr(path, '.');
+ if (name == NULL) {
+ nvl = config_root;
+ name = path;
+ } else {
+ node_name = strndup(path, name - path);
+ if (node_name == NULL)
+ errx(4, "Failed to allocate memory");
+ nvl = find_config_node(node_name);
+ free(node_name);
+ if (nvl == NULL)
+ return (NULL);
+
+ /* Skip over '.'. */
+ name++;
+ }
+
+ if (nvlist_exists_string(nvl, name))
+ return (nvlist_get_string(nvl, name));
+ if (nvlist_exists_nvlist(nvl, name))
+ warnx("Attempting to fetch value of node %s", path);
+ return (NULL);
+}
+
+static char *
+_expand_config_value(const char *value, int depth)
+{
+ FILE *valfp;
+ const char *cp, *vp;
+ char *nestedval, *path, *valbuf;
+ size_t valsize;
+
+ valfp = open_memstream(&valbuf, &valsize);
+ if (valfp == NULL)
+ errx(4, "Failed to allocate memory");
+
+ vp = value;
+ while (*vp != '\0') {
+ switch (*vp) {
+ case '%':
+ if (depth > 15) {
+ warnx(
+ "Too many recursive references in configuration value");
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ if (vp[1] != '(' || vp[2] == '\0')
+ cp = NULL;
+ else
+ cp = strchr(vp + 2, ')');
+ if (cp == NULL) {
+ warnx(
+ "Invalid reference in configuration value \"%s\"",
+ value);
+ fputc('%', valfp);
+ vp++;
+ break;
+ }
+ vp += 2;
+
+ if (cp == vp) {
+ warnx(
+ "Empty reference in configuration value \"%s\"",
+ value);
+ vp++;
+ break;
+ }
+
+ /* Allocate a C string holding the path. */
+ path = strndup(vp, cp - vp);
+ if (path == NULL)
+ errx(4, "Failed to allocate memory");
+
+ /* Advance 'vp' past the reference. */
+ vp = cp + 1;
+
+ /* Fetch the referenced value. */
+ cp = get_raw_config_value(path);
+ if (cp == NULL)
+ warnx(
+ "Failed to fetch referenced configuration variable %s",
+ path);
+ else {
+ nestedval = _expand_config_value(cp, depth + 1);
+ fputs(nestedval, valfp);
+ free(nestedval);
+ }
+ free(path);
+ break;
+ case '\\':
+ vp++;
+ if (*vp == '\0') {
+ warnx(
+ "Trailing \\ in configuration value \"%s\"",
+ value);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ fputc(*vp, valfp);
+ vp++;
+ break;
+ }
+ }
+ fclose(valfp);
+ return (valbuf);
+}
+
+static const char *
+expand_config_value(const char *value)
+{
+ static char *valbuf;
+
+ if (strchr(value, '%') == NULL)
+ return (value);
+
+ free(valbuf);
+ valbuf = _expand_config_value(value, 0);
+ return (valbuf);
+}
+
+const char *
+get_config_value(const char *path)
+{
+ const char *value;
+
+ value = get_raw_config_value(path);
+ if (value == NULL)
+ return (NULL);
+ return (expand_config_value(value));
+}
+
+const char *
+get_config_value_node(const nvlist_t *parent, const char *name)
+{
+
+ if (strchr(name, '.') != NULL)
+ errx(4, "Invalid config node name %s", name);
+ if (parent == NULL)
+ parent = config_root;
+ if (nvlist_exists_nvlist(parent, name))
+ warnx("Attempt to fetch value of node %s of list %p", name,
+ parent);
+ if (!nvlist_exists_string(parent, name))
+ return (NULL);
+
+ return (expand_config_value(nvlist_get_string(parent, name)));
+}
+
+static bool
+_bool_value(const char *name, const char *value)
+{
+
+ if (strcasecmp(value, "true") == 0 ||
+ strcasecmp(value, "on") == 0 ||
+ strcasecmp(value, "yes") == 0 ||
+ strcmp(value, "1") == 0)
+ return (true);
+ if (strcasecmp(value, "false") == 0 ||
+ strcasecmp(value, "off") == 0 ||
+ strcasecmp(value, "no") == 0 ||
+ strcmp(value, "0") == 0)
+ return (false);
+ err(4, "Invalid value %s for boolean variable %s", value, name);
+}
+
+bool
+get_config_bool(const char *path)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", path);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_default(const char *path, bool def)
+{
+ const char *value;
+
+ value = get_config_value(path);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_node(const nvlist_t *parent, const char *name)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ err(4, "Failed to fetch boolean variable %s", name);
+ return (_bool_value(name, value));
+}
+
+bool
+get_config_bool_node_default(const nvlist_t *parent, const char *name,
+ bool def)
+{
+ const char *value;
+
+ value = get_config_value_node(parent, name);
+ if (value == NULL)
+ return (def);
+ return (_bool_value(name, value));
+}
+
+void
+set_config_bool(const char *path, bool value)
+{
+
+ set_config_value(path, value ? "true" : "false");
+}
+
+void
+set_config_bool_node(nvlist_t *parent, const char *name, bool value)
+{
+
+ set_config_value_node(parent, name, value ? "true" : "false");
+}
+
+static void
+dump_tree(const char *prefix, const nvlist_t *nvl)
+{
+ const char *name;
+ void *cookie;
+ int type;
+
+ cookie = NULL;
+ while ((name = nvlist_next(nvl, &type, &cookie)) != NULL) {
+ if (type == NV_TYPE_NVLIST) {
+ char *new_prefix;
+
+ asprintf(&new_prefix, "%s%s.", prefix, name);
+ dump_tree(new_prefix, nvlist_get_nvlist(nvl, name));
+ free(new_prefix);
+ } else {
+ assert(type == NV_TYPE_STRING);
+ printf("%s%s=%s\n", prefix, name,
+ nvlist_get_string(nvl, name));
+ }
+ }
+}
+
+void
+dump_config(void)
+{
+ dump_tree("", config_root);
+}
diff --git a/tests/sys/virtio/debug.h b/tests/sys/virtio/debug.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/debug.h
@@ -0,0 +1,40 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _DEBUG_H_
+#define _DEBUG_H_
+
+
+#define FPRINTLN(filep, fmt, arg...) \
+ do { \
+ fprintf(filep, fmt "\n", ##arg); \
+ } while (0)
+
+#define PRINTLN(fmt, arg...) FPRINTLN(stdout, fmt, ##arg)
+#define EPRINTLN(fmt, arg...) FPRINTLN(stderr, fmt, ##arg)
+
+#endif
diff --git a/tests/sys/virtio/iov.h b/tests/sys/virtio/iov.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IOV_H_
+#define _IOV_H_
+
+void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
+ int *niov2, size_t seek);
+void truncate_iov(struct iovec *iov, int *niov, size_t length);
+size_t count_iov(const struct iovec *iov, int niov);
+ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+ int niov, size_t seek);
+
+#endif /* _IOV_H_ */
diff --git a/tests/sys/virtio/iov.c b/tests/sys/virtio/iov.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
+ * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include "iov.h"
+
+void
+seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2,
+ size_t seek)
+{
+ size_t remainder = 0;
+ size_t left = seek;
+ int i, j;
+
+ for (i = 0; i < niov1; i++) {
+ size_t toseek = MIN(left, iov1[i].iov_len);
+ left -= toseek;
+
+ if (toseek == iov1[i].iov_len)
+ continue;
+
+ if (left == 0) {
+ remainder = toseek;
+ break;
+ }
+ }
+
+ for (j = i; j < niov1; j++) {
+ iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder;
+ iov2[j - i].iov_len = iov1[j].iov_len - remainder;
+ remainder = 0;
+ }
+
+ *niov2 = j - i;
+}
+
+size_t
+count_iov(const struct iovec *iov, int niov)
+{
+ size_t total = 0;
+ int i;
+
+ for (i = 0; i < niov; i++)
+ total += iov[i].iov_len;
+
+ return (total);
+}
+
+void
+truncate_iov(struct iovec *iov, int *niov, size_t length)
+{
+ size_t done = 0;
+ int i;
+
+ for (i = 0; i < *niov; i++) {
+ size_t toseek = MIN(length - done, iov[i].iov_len);
+ done += toseek;
+
+ if (toseek < iov[i].iov_len || done == length) {
+ iov[i].iov_len = toseek;
+ *niov = i + 1;
+ return;
+ }
+ }
+}
+
+ssize_t
+iov_to_buf(const struct iovec *iov, int niov, void **buf)
+{
+ size_t ptr, total;
+ int i;
+
+ total = count_iov(iov, niov);
+ *buf = realloc(*buf, total);
+ if (*buf == NULL)
+ return (-1);
+
+ for (i = 0, ptr = 0; i < niov; i++) {
+ memcpy((uint8_t *)*buf + ptr, iov[i].iov_base, iov[i].iov_len);
+ ptr += iov[i].iov_len;
+ }
+
+ return (total);
+}
+
+ssize_t
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
+ size_t seek)
+{
+ struct iovec *diov;
+ size_t off = 0, len;
+ int i;
+
+ if (seek > 0) {
+ int ndiov;
+
+ diov = malloc(sizeof(struct iovec) * niov);
+ seek_iov(iov, niov, diov, &ndiov, seek);
+ iov = diov;
+ niov = ndiov;
+ }
+
+ for (i = 0; i < niov && off < buflen; i++) {
+ len = MIN(iov[i].iov_len, buflen - off);
+ memcpy(iov[i].iov_base, (const uint8_t *)buf + off, len);
+ off += len;
+ }
+
+ if (seek > 0)
+ free(diov);
+
+ return ((ssize_t)off);
+}
+
diff --git a/tests/sys/virtio/iov_emul.h b/tests/sys/virtio/iov_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.h
@@ -0,0 +1,20 @@
+#ifndef _IOV_EMUL_E
+#define _IOV_EMUL_E
+
+struct virtio_softc;
+
+struct iov_emul {
+ struct vtdbg_transfer *iove_tf;
+ size_t iove_maxcnt;
+ size_t iove_ind;
+};
+
+#define IOVE_INIT (16)
+
+struct iov_emul *iove_alloc(void);
+void iove_free(struct iov_emul *iove);
+int iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov);
+int iove_import(int fd, struct iov_emul *iove);
+int iove_export(int fd, struct iov_emul *iove);
+
+#endif /* _IOV_EMUL_E */
diff --git a/tests/sys/virtio/iov_emul.c b/tests/sys/virtio/iov_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/iov_emul.c
@@ -0,0 +1,106 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+struct iov_emul *
+iove_alloc(void)
+{
+ struct iov_emul *iove;
+
+ iove = calloc(1, sizeof(*iove));
+ if (iove == NULL)
+ return (NULL);
+
+ iove->iove_tf = calloc(IOVE_INIT, sizeof(*iove->iove_tf));
+ if (iove->iove_tf == NULL) {
+ free(iove);
+ return (NULL);
+ }
+
+ iove->iove_maxcnt = IOVE_INIT;
+
+ return (iove);
+}
+
+void
+iove_free(struct iov_emul *iove)
+{
+ size_t i;
+
+ for (i = 0; i < iove->iove_ind; i++)
+ free(iove->iove_tf[i].vtdt_device);
+
+ free(iove->iove_tf);
+ free(iove);
+}
+
+
+int
+iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov)
+{
+ struct vtdbg_transfer *tf = iove->iove_tf;
+ size_t ind = iove->iove_ind;
+ char *base;
+
+ if (ind == iove->iove_maxcnt) {
+ tf = reallocarray(tf, 2 * iove->iove_maxcnt,
+ sizeof(*tf));
+ if (tf == NULL)
+ return (ENOMEM);
+ iove->iove_tf = tf;
+ iove->iove_maxcnt *= 2;
+ }
+
+ base = malloc(len);
+ if (base == NULL)
+ return (ENOMEM);
+
+ iove->iove_tf[ind].vtdt_device = base;
+ iove->iove_tf[ind].vtdt_driver = (caddr_t) phys;
+ iove->iove_tf[ind].vtdt_len = len;
+ iove->iove_ind += 1;
+
+ iov->iov_base = base;
+ iov->iov_len = len;
+
+ return (0);
+}
+
+
+/*
+ * Import a read IO vector from the kernel.
+ */
+int
+iove_import(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = true,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
+
+/*
+ * Export a write IO vector to the kernel.
+ */
+int
+iove_export(int fd, struct iov_emul *iove)
+{
+ struct vtdbg_io_args args = {
+ .transfers = iove->iove_tf,
+ .cnt = iove->iove_ind,
+ .touser = false,
+ };
+
+ return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args));
+}
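+
+/*
+ * Illustrative flow (a sketch; "ctrl_fd", "gpa" and "len" are
+ * placeholders rather than names used elsewhere in this file): stage a
+ * driver buffer, pull its contents in through the debug control
+ * device, operate on them, and push the result back out.
+ *
+ *     struct iov_emul *iove = iove_alloc();
+ *     struct iovec iov;
+ *
+ *     if (iove != NULL &&
+ *         iove_add(iove, gpa, len, &iov) == 0 &&
+ *         iove_import(ctrl_fd, iove) == 0) {
+ *             memset(iov.iov_base, 0, iov.iov_len);
+ *             (void)iove_export(ctrl_fd, iove);
+ *     }
+ *     if (iove != NULL)
+ *             iove_free(iove);
+ */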
+
diff --git a/tests/sys/virtio/mevent.h b/tests/sys/virtio/mevent.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.h
@@ -0,0 +1,60 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE,
+ EVF_TIMER,
+ EVF_SIGNAL,
+ EVF_VNODE,
+};
+
+/* Filter flags for EVF_VNODE */
+#define EVFF_ATTRIB 0x0001
+
+typedef void mevent_cb_t(int, enum ev_type, void *, uint64_t);
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type, mevent_cb_t *func,
+ void *param);
+struct mevent *mevent_add_flags(int fd, enum ev_type type, int fflags,
+ mevent_cb_t *func, void *param);
+struct mevent *mevent_add_disabled(int fd, enum ev_type type,
+ mevent_cb_t *func, void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+int mevent_timer_update(struct mevent *evp, int msecs);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/tests/sys/virtio/mevent.c b/tests/sys/virtio/mevent.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mevent.c
@@ -0,0 +1,564 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, and having events be persistent by default.
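+ *
+ * A minimal consumer (sketch; "conn_fd" and the handler body are
+ * illustrative) registers a persistent read event and then hands the
+ * calling thread to the dispatch loop, which does not return:
+ *
+ *     static void
+ *     rx_ready(int fd, enum ev_type type, void *param, uint64_t data)
+ *     {
+ *             char buf[64];
+ *
+ *             (void)read(fd, buf, sizeof(buf));
+ *     }
+ *
+ *     (void)mevent_add(conn_fd, EVF_READ, rx_ready, NULL);
+ *     mevent_dispatch();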
+ */
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+static pthread_t mevent_tid;
+static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
+static int mevent_timid = 43;
+static int mevent_pipefd[2];
+static int mfd;
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ mevent_cb_t *me_func;
+#define me_msecs me_fd
+ int me_fd;
+ int me_timid;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state; /* Desired kevent flags. */
+ int me_closefd;
+ int me_fflags;
+ LIST_ENTRY(mevent) me_list;
+};
+
+enum mevent_update_type {
+ UPDATE_ENABLE,
+ UPDATE_DISABLE,
+ UPDATE_TIMER,
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused,
+ uint64_t data __unused)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c = '\0';
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
+
+static void
+mevent_init(void)
+{
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_KQUEUE);
+ if (caph_rights_limit(mfd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ LIST_INIT(&change_head);
+ LIST_INIT(&global_head);
+}
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ if (mevp->me_type == EVF_TIMER)
+ retval = EVFILT_TIMER;
+
+ if (mevp->me_type == EVF_SIGNAL)
+ retval = EVFILT_SIGNAL;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval = EVFILT_VNODE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = mevp->me_state;
+
+ if (mevp->me_type == EVF_VNODE)
+ retval |= EV_CLEAR;
+
+ return (retval);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ switch (mevp->me_type) {
+ case EVF_VNODE:
+ if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
+ retval |= NOTE_ATTRIB;
+ break;
+ case EVF_READ:
+ case EVF_WRITE:
+ case EVF_TIMER:
+ case EVF_SIGNAL:
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+mevent_populate(struct mevent *mevp, struct kevent *kev)
+{
+ if (mevp->me_type == EVF_TIMER) {
+ kev->ident = mevp->me_timid;
+ kev->data = mevp->me_msecs;
+ } else {
+ kev->ident = mevp->me_fd;
+ kev->data = 0;
+ }
+ kev->filter = mevent_kq_filter(mevp);
+ kev->flags = mevent_kq_flags(mevp);
+ kev->fflags = mevent_kq_fflags(mevp);
+ kev->udata = mevp;
+}
+
+static int
+mevent_build(struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ mevent_populate(mevp, &kev[i]);
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state & EV_DELETE) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ uint64_t data;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+ data = kev[i].data;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param, data);
+ }
+}
+
+static struct mevent *
+mevent_add_state(int tfd, enum ev_type type, mevent_cb_t *func, void *param,
+ int state, int fflags)
+{
+ struct kevent kev;
+ struct mevent *lp, *mevp;
+ int ret;
+
+ if (tfd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ pthread_once(&mevent_once, mevent_init);
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (type != EVF_TIMER && lp->me_fd == tfd &&
+ lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry and populate it.
+ */
+ mevp = calloc(1, sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ if (type == EVF_TIMER) {
+ mevp->me_msecs = tfd;
+ mevp->me_timid = mevent_timid++;
+ } else
+ mevp->me_fd = tfd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+ mevp->me_state = state;
+ mevp->me_fflags = fflags;
+
+ /*
+ * Try to add the event. If this fails, report the failure to
+ * the caller.
+ */
+ mevent_populate(mevp, &kev);
+ ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
+ if (ret == -1) {
+ free(mevp);
+ mevp = NULL;
+ goto exit;
+ }
+
+ mevp->me_state &= ~EV_ADD;
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+struct mevent *
+mevent_add(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
+}
+
+struct mevent *
+mevent_add_flags(int tfd, enum ev_type type, int fflags, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
+}
+
+struct mevent *
+mevent_add_disabled(int tfd, enum ev_type type, mevent_cb_t *func, void *param)
+{
+
+ return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
+}
+
+static int
+mevent_update(struct mevent *evp, enum mevent_update_type type, int msecs)
+{
+ int newstate;
+
+ mevent_qlock();
+
+ /*
+ * It's not possible to update a deleted event
+ */
+ assert((evp->me_state & EV_DELETE) == 0);
+
+ newstate = evp->me_state;
+ if (type == UPDATE_ENABLE) {
+ newstate |= EV_ENABLE;
+ newstate &= ~EV_DISABLE;
+ } else if (type == UPDATE_DISABLE) {
+ newstate |= EV_DISABLE;
+ newstate &= ~EV_ENABLE;
+ } else {
+ assert(type == UPDATE_TIMER);
+ assert(evp->me_type == EVF_TIMER);
+ newstate |= EV_ADD;
+ evp->me_msecs = msecs;
+ }
+
+ /*
+ * No update needed if enable/disable had no effect
+ */
+ if (evp->me_state != newstate || type == UPDATE_TIMER) {
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not
+ * already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_ENABLE, -1));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+ return (mevent_update(evp, UPDATE_DISABLE, -1));
+}
+
+int
+mevent_timer_update(struct mevent *evp, int msecs)
+{
+ return (mevent_update(evp, UPDATE_TIMER, msecs));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = EV_DELETE;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+
+ pthread_set_name_np(mevent_tid, "mevent");
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int numev;
+ int ret;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ pthread_once(&mevent_once, mevent_init);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+ ret = pipe(mevent_pipefd);
+ if (ret < 0) {
+ perror("pipe");
+ exit(0);
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+ if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1 && errno != EINTR) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/tests/sys/virtio/mmio_emul.h b/tests/sys/virtio/mmio_emul.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.h
@@ -0,0 +1,117 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _MMIO_EMUL_H_
+#define _MMIO_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+#include <sys/nv.h>
+#include <sys/_pthreadtypes.h>
+
+#include <assert.h>
+
+#define MI_NAMESZ (40)
+
+struct mmio_devinst;
+
+struct mmio_devemu {
+ const char *me_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*me_init)(struct mmio_devinst *, nvlist_t *);
+ void (*me_write)(struct mmio_devinst *mdi, uint64_t offset,
+ int size, uint32_t value);
+};
+#define MMIO_EMUL_SET(x) DATA_SET(mmio_devemu_set, x)
+
+enum mmio_devstate {
+ MIDEV_INVALID,
+ MIDEV_ACKNOWLEDGED,
+ MIDEV_DRIVER_FOUND,
+ MIDEV_FEATURES_OK,
+ MIDEV_LIVE,
+ MIDEV_FAILED,
+ MIDEV_DEVICE_STATES,
+};
+
+struct mmio_devinst {
+ struct mmio_devemu *mi_d;
+ char mi_name[MI_NAMESZ];
+ char *mi_addr; /* VQ control region */
+ size_t mi_bytes; /* Size of region in bytes */
+ int mi_fd; /* File descriptor for the region. */
+ enum mmio_devstate mi_state;
+};
+
+/*
+ * XXX Sensible default until proven otherwise; this should be kept in
+ * sync with the in-kernel header.
+ */
+#define MMIO_TOTAL_SIZE (1024 * 1024 * 10)
+#define MMIO_CTRDEV ("/dev/vtdbg")
+
+int init_mmio(nvlist_t *nvl);
+void mmio_print_supported_devices(void);
+int mmio_parse_device(nvlist_t *nvl, char *opt);
+
+static __inline void
+mmio_set_cfgdata8(struct mmio_devinst *mdi, int offset, uint8_t val)
+{
+ *(uint8_t *)(mdi->mi_addr + offset) = val;
+}
+
+static __inline void
+mmio_set_cfgdata16(struct mmio_devinst *mdi, int offset, uint16_t val)
+{
+ *(uint16_t *)(mdi->mi_addr + offset) = htole16(val);
+}
+
+static __inline void
+mmio_set_cfgdata32(struct mmio_devinst *mdi, int offset, uint32_t val)
+{
+ *(uint32_t *)(mdi->mi_addr + offset) = htole32(val);
+}
+
+static __inline uint8_t
+mmio_get_cfgdata8(struct mmio_devinst *mdi, int offset)
+{
+ return (*(uint8_t *)(mdi->mi_addr + offset));
+}
+
+static __inline uint16_t
+mmio_get_cfgdata16(struct mmio_devinst *mdi, int offset)
+{
+ return le16toh((*(uint16_t *)(mdi->mi_addr + offset)));
+}
+
+static __inline uint32_t
+mmio_get_cfgdata32(struct mmio_devinst *mdi, int offset)
+{
+ return le32toh((*(uint32_t *)(mdi->mi_addr + offset)));
+}
+
+#endif /* _MMIO_EMUL_H_ */
diff --git a/tests/sys/virtio/mmio_emul.c b/tests/sys/virtio/mmio_emul.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_emul.c
@@ -0,0 +1,178 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/nv.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+SET_DECLARE(mmio_devemu_set, struct mmio_devemu);
+
+static struct mmio_devemu *
+mmio_emul_finddev(const char *name)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ if (!strcmp(mdp->me_emu, name)) {
+ return (mdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void *
+mmio_emul_driver_init(void *arg)
+{
+ int error;
+ int fd = (int)(long)arg;
+
+ error = ioctl(fd, VIRTIO_DBG_INIT);
+ if (error < 0) {
+ EPRINTLN("Control device initialization error: %s",
+ strerror(errno));
+ exit(1);
+ }
+ pthread_exit(NULL);
+}
+
+static int
+mmio_emul_control_init(struct mmio_devinst *mdi, struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ pthread_t thread;
+ char *mmio;
+ int err;
+ int fd;
+
+ fd = open(MMIO_CTRDEV, O_RDWR);
+ if (fd == -1) {
+ EPRINTLN("Control device open error: %s",
+ strerror(errno));
+ return (-1);
+ }
+
+ mmio = mmap(NULL, MMIO_TOTAL_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FILE | MAP_SHARED, fd, 0);
+ if (mmio == MAP_FAILED) {
+ EPRINTLN("Control device mapping error: %s",
+ strerror(errno));
+ close(fd);
+ return (-1);
+ }
+
+ mdi->mi_fd = fd;
+ mdi->mi_addr = mmio;
+ mdi->mi_bytes = MMIO_TOTAL_SIZE;
+
+ /*
+ * XXX Hack. We currently hardwire the block device ID. Propagate
+ * the device type in a different way.
+ */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_VIRT);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VERSION, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_DEVICE_ID, 0x2);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR);
+
+ err = (mde->me_init)(mdi, nvl);
+ if (err != 0)
+ return (err);
+
+ /*
+ * Issue the ioctl out of band, because we will use this thread to
+ * service the register writes triggered by the driver during device
+ * attach.
+ */
+ return (pthread_create(&thread, NULL, mmio_emul_driver_init, (void *)(long)fd));
+}
+
+static int
+mmio_emul_init(struct mmio_devemu *mde, nvlist_t *nvl)
+{
+ struct mmio_devinst *mdi;
+ int err;
+
+ mdi = calloc(1, sizeof(struct mmio_devinst));
+ if (mdi == NULL)
+ return (ENOMEM);
+
+ snprintf(mdi->mi_name, sizeof(mdi->mi_name), "%s@mmio", mde->me_emu);
+ mdi->mi_state = MIDEV_INVALID;
+ mdi->mi_fd = -1;
+
+ err = mmio_emul_control_init(mdi, mde, nvl);
+ if (err != 0) {
+ free(mdi);
+ return (err);
+ }
+
+ return (0);
+}
+
+int
+mmio_parse_device(nvlist_t *nvl, char *opt)
+{
+ struct mmio_devemu *mde;
+ char *emul = opt;
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+ EPRINTLN("unknown mmio device %s\n", emul);
+ return (EINVAL);
+ }
+
+ if (get_config_value_node(nvl, "devtype") != NULL) {
+ EPRINTLN("device type already defined!");
+ return (EINVAL);
+ }
+
+ set_config_value_node(nvl, "devtype", mde->me_emu);
+
+ return (0);
+}
+
+
+void
+mmio_print_supported_devices(void)
+{
+ struct mmio_devemu **mdpp, *mdp;
+
+ SET_FOREACH(mdpp, mmio_devemu_set) {
+ mdp = *mdpp;
+ printf("%s\n", mdp->me_emu);
+ }
+}
+
+int
+init_mmio(nvlist_t *nvl)
+{
+ struct mmio_devemu *mde;
+ const char *emul;
+
+ emul = get_config_value_node(nvl, "devtype");
+ if (emul == NULL) {
+ EPRINTLN("mmio device missing devtype value");
+ return (EINVAL);
+ }
+
+ mde = mmio_emul_finddev(emul);
+ if (mde == NULL) {
+ EPRINTLN("mmio unknown device \"%s\"", emul);
+ return (EINVAL);
+ }
+
+ return (mmio_emul_init(mde, nvl));
+}
diff --git a/tests/sys/virtio/mmio_virtio_block.c b/tests/sys/virtio/mmio_virtio_block.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/mmio_virtio_block.c
@@ -0,0 +1,560 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ * Copyright 2020-2021 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <stdbool.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <md5.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+#include "block_if.h"
+#include "iov_emul.h"
+
+#define VTBLK_BSIZE 512
+#define VTBLK_RINGSZ 128
+
+_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+#define VTBLK_S_UNSUPP 2
+
+#define VTBLK_BLK_ID_BYTES 20 + 1
+
+/* Capability bits */
+#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */
+#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */
+#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */
+#define VTBLK_F_RO (1 << 5) /* Disk is read-only */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/
+#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */
+#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */
+#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */
+#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */
+#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */
+#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */
+#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */
+#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( VTBLK_F_SEG_MAX | \
+ VTBLK_F_BLK_SIZE | \
+ VTBLK_F_FLUSH | \
+ VTBLK_F_TOPOLOGY )
+ /* XXX Reactivate */
+// VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
+
+/*
+ * The current blockif_delete() interface only allows a single delete
+ * request at a time.
+ */
+#define VTBLK_MAX_DISCARD_SEG 1
+
+/*
+ * An arbitrary limit to prevent excessive latency due to large
+ * delete requests.
+ */
+#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */
+
+/*
+ * Config space "registers"
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ struct {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } vbc_geometry;
+ uint32_t vbc_blk_size;
+ struct {
+ uint8_t physical_block_exp;
+ uint8_t alignment_offset;
+ uint16_t min_io_size;
+ uint32_t opt_io_size;
+ } vbc_topology;
+ uint8_t vbc_writeback;
+ uint8_t unused0[1];
+ uint16_t num_queues;
+ uint32_t max_discard_sectors;
+ uint32_t max_discard_seg;
+ uint32_t discard_sector_alignment;
+ uint32_t max_write_zeroes_sectors;
+ uint32_t max_write_zeroes_seg;
+ uint8_t write_zeroes_may_unmap;
+ uint8_t unused1[3];
+} __packed;
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+#define VBH_OP_SCSI_CMD 2
+#define VBH_OP_SCSI_CMD_OUT 3
+#define VBH_OP_FLUSH 4
+#define VBH_OP_FLUSH_OUT 5
+#define VBH_OP_IDENT 8
+#define VBH_OP_DISCARD 11
+#define VBH_OP_WRITE_ZEROES 13
+
+#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int mmio_vtblk_debug;
+#define DPRINTF(params) if (mmio_vtblk_debug) PRINTLN params
+#define WPRINTF(params) PRINTLN params
+
+struct mmio_vtblk_ioreq {
+ struct blockif_req io_req;
+ struct mmio_vtblk_softc *io_sc;
+ uint8_t *io_status;
+ uint16_t io_idx;
+ struct iov_emul *io_iove;
+};
+
+struct virtio_blk_discard_write_zeroes {
+ uint64_t sector;
+ uint32_t num_sectors;
+ struct {
+ uint32_t unmap:1;
+ uint32_t reserved:31;
+ } flags;
+};
+
+/*
+ * Per-device softc
+ */
+struct mmio_vtblk_softc {
+ struct virtio_softc vbsc_vs;
+ pthread_mutex_t vsc_mtx;
+ struct vqueue_info vbsc_vq;
+ struct vtblk_config *vbsc_cfg;
+ struct virtio_consts vbsc_consts;
+ struct blockif_ctxt *bc;
+ char vbsc_ident[VTBLK_BLK_ID_BYTES];
+ struct mmio_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
+};
+
+static void mmio_vtblk_reset(void *);
+static void mmio_vtblk_notify(void *, struct vqueue_info *);
+static int mmio_vtblk_cfgread(void *, int, int, uint32_t *);
+static int mmio_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+ .vc_name = "vtblk",
+ .vc_nvq = 1,
+ .vc_cfgsize = sizeof(struct vtblk_config),
+ .vc_reset = mmio_vtblk_reset,
+ .vc_qnotify = mmio_vtblk_notify,
+ .vc_cfgread = mmio_vtblk_cfgread,
+ .vc_cfgwrite = mmio_vtblk_cfgwrite,
+ .vc_apply_features = NULL,
+ .vc_hv_caps = VTBLK_S_HOSTCAPS,
+};
+
+static void
+mmio_vtblk_reset(void *vsc)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device reset requested !"));
+ vi_reset_dev(&sc->vbsc_vs);
+}
+
+static void
+mmio_vtblk_done_locked(struct mmio_vtblk_ioreq *io, int err)
+{
+ struct mmio_vtblk_softc *sc = io->io_sc;
+ int fd = sc->vbsc_vs.vs_mi->mi_fd;
+
+ /* convert errno into a virtio block error return */
+ if (err == EOPNOTSUPP || err == ENOSYS)
+ *io->io_status = VTBLK_S_UNSUPP;
+ else if (err != 0)
+ *io->io_status = VTBLK_S_IOERR;
+ else
+ *io->io_status = VTBLK_S_OK;
+
+ iove_export(fd, io->io_iove);
+ iove_free(io->io_iove);
+ io->io_iove = NULL;
+
+ /*
+ * Return the descriptor back to the host.
+ * We wrote 1 byte (our status) to host.
+ */
+ vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
+ vq_endchains(&sc->vbsc_vq, 0);
+}
+
+static void
+mmio_vtblk_done(struct blockif_req *br, int err)
+{
+ struct mmio_vtblk_ioreq *io = br->br_param;
+ struct mmio_vtblk_softc *sc = io->io_sc;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ mmio_vtblk_done_locked(io, err);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+mmio_vtblk_proc(struct mmio_vtblk_softc *sc, struct vqueue_info *vq)
+{
+ struct virtio_blk_hdr *vbh;
+ struct mmio_vtblk_ioreq *io;
+ int i, n;
+ int err;
+ ssize_t iolen;
+ int writeop, type;
+ struct vi_req req;
+ struct iovec iov[BLOCKIF_IOV_MAX + 2];
+ struct virtio_blk_discard_write_zeroes *discard;
+
+ n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req);
+
+ /*
+ * The first descriptor will be the read-only fixed header,
+ * and the last is for status (hence +2 above and below).
+ * The remaining iov's are the actual data I/O vectors.
+ *
+ * XXX - note - this fails on crash dump, which does a
+ * VIRTIO_BLK_T_FLUSH with a zero transfer length
+ */
+ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
+
+ io = &sc->vbsc_ios[req.idx];
+ assert(req.readable != 0);
+ assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+ vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
+ memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
+ io->io_req.br_iovcnt = n - 2;
+ io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
+ io->io_status = (uint8_t *)iov[--n].iov_base;
+ io->io_iove = req.iove;
+ assert(req.writable != 0);
+ assert(iov[n].iov_len == 1);
+
+ /*
+ * XXX
+ * The guest should not be setting the BARRIER flag because
+ * we don't advertise the capability.
+ */
+ type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+ writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
+ /*
+ * - Write op implies read-only descriptor
+ * - Read/ident op implies write-only descriptor
+ *
+ * By taking away either the read-only fixed header or the write-only
+ * status iovec, the following condition should hold true.
+ */
+ assert(n == (writeop ? req.readable : req.writable));
+
+ iolen = 0;
+ for (i = 1; i < n; i++) {
+ iolen += iov[i].iov_len;
+ }
+ io->io_req.br_resid = iolen;
+
+ DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
+ writeop ? "write/discard" : "read/ident", iolen, i - 1,
+ io->io_req.br_offset));
+
+ switch (type) {
+ case VBH_OP_READ:
+ err = blockif_read(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_WRITE:
+ err = blockif_write(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_DISCARD:
+ /*
+ * We currently only support a single discard segment; if the
+ * guest submits a request that does not conform to that
+ * requirement, return an error.
+ */
+ if (iov[1].iov_len != sizeof (*discard)) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ /* The segments to discard are provided rather than data */
+ discard = (struct virtio_blk_discard_write_zeroes *)
+ iov[1].iov_base;
+
+ /*
+ * virtio v1.1 5.2.6.2:
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
+ * for discard and write zeroes commands if any unknown flag is
+ * set. Furthermore, the device MUST set the status byte to
+ * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
+ * is set.
+ *
+ * Currently there are no known flags for a DISCARD request.
+ */
+ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
+ mmio_vtblk_done_locked(io, ENOTSUP);
+ return;
+ }
+
+ /* Make sure the request doesn't exceed our size limit */
+ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
+ mmio_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
+ io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
+ err = blockif_delete(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_FLUSH:
+ case VBH_OP_FLUSH_OUT:
+ err = blockif_flush(sc->bc, &io->io_req);
+ break;
+ case VBH_OP_IDENT:
+ /* Assume a single buffer. */
+ /* A serial number that fills the buffer is not zero-terminated. */
+ memset(iov[1].iov_base, 0, iov[1].iov_len);
+ strncpy(iov[1].iov_base, sc->vbsc_ident,
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ mmio_vtblk_done_locked(io, 0);
+ return;
+ default:
+ mmio_vtblk_done_locked(io, EOPNOTSUPP);
+ return;
+ }
+ assert(err == 0);
+}
+
+static void
+mmio_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+
+ while (vq_has_descs(vq))
+ mmio_vtblk_proc(sc, vq);
+}
+
+static void
+mmio_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg,
+ size_t new_size, uint64_t data __unused)
+{
+ struct mmio_vtblk_softc *sc;
+
+ sc = arg;
+
+ sc->vbsc_cfg->vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */
+ /* XXX Handle resizing. */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+static void
+mmio_vtblk_event(int fd, enum ev_type type, void *arg, uint64_t offset)
+{
+ struct mmio_vtblk_softc *sc = (struct mmio_vtblk_softc *)arg;
+ struct mmio_devinst *mdi = sc->vbsc_vs.vs_mi;
+
+ assert(fd == mdi->mi_fd);
+ assert(type == EVF_READ);
+
+ vi_mmio_write(&sc->vbsc_vs, offset);
+
+ /* Let in-progress operations continue. */
+ ioctl(mdi->mi_fd, VIRTIO_DBG_ACK);
+}
+
+static int
+mmio_vtblk_init(struct mmio_devinst *mdi, nvlist_t *nvl)
+{
+ char bident[MI_NAMESZ];
+ struct blockif_ctxt *bctxt;
+ const char *path, *serial;
+ MD5_CTX mdctx;
+ u_char digest[16];
+ struct mmio_vtblk_softc *sc;
+ off_t size;
+ int i, sectsz, sts, sto;
+
+ /* The supplied backing file has to exist. */
+ /* Make sure the name fits. */
+ snprintf(bident, sizeof(bident), "%s", mdi->mi_name);
+ bctxt = blockif_open(nvl, bident);
+ if (bctxt == NULL) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ size = blockif_size(bctxt);
+ sectsz = blockif_sectsz(bctxt);
+ blockif_psectsz(bctxt, &sts, &sto);
+
+ sc = calloc(1, sizeof(struct mmio_vtblk_softc));
+ sc->vbsc_cfg = (struct vtblk_config *)((uint64_t)mdi->mi_addr + VIRTIO_MMIO_CONFIG);
+
+ sc->bc = bctxt;
+ for (i = 0; i < VTBLK_RINGSZ; i++) {
+ struct mmio_vtblk_ioreq *io = &sc->vbsc_ios[i];
+ io->io_req.br_callback = mmio_vtblk_done;
+ io->io_req.br_param = io;
+ io->io_sc = sc;
+ io->io_idx = i;
+ }
+
+ bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
+ if (blockif_candelete(sc->bc))
+ sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /* init virtio softc and virtqueues */
+ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, mdi, &sc->vbsc_vq);
+ sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+ sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+ /* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+ /*
+ * If an explicit identifier is not given, create an
+ * identifier using parts of the md5 sum of the filename.
+ */
+ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES);
+ if ((serial = get_config_value_node(nvl, "serial")) != NULL ||
+ (serial = get_config_value_node(nvl, "ser")) != NULL) {
+ strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES);
+ } else {
+ path = get_config_value_node(nvl, "path");
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, path, strlen(path));
+ MD5Final(digest, &mdctx);
+ snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
+ "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4],
+ digest[5]);
+ }
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg->vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
+ sc->vbsc_cfg->vbc_size_max = 0; /* not negotiated */
+
+ /*
+ * If Linux is presented with a seg_max greater than the virtio queue
+ * size, it can stumble into situations where it violates its own
+ * invariants and panics. For safety, we keep seg_max clamped, paying
+ * heed to the two extra descriptors needed for the header and status
+ * of a request.
+ */
+ sc->vbsc_cfg->vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
+ sc->vbsc_cfg->vbc_geometry.cylinders = 0; /* no geometry */
+ sc->vbsc_cfg->vbc_geometry.heads = 0;
+ sc->vbsc_cfg->vbc_geometry.sectors = 0;
+ sc->vbsc_cfg->vbc_blk_size = sectsz;
+ sc->vbsc_cfg->vbc_topology.physical_block_exp =
+ (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
+ sc->vbsc_cfg->vbc_topology.alignment_offset =
+ (sto != 0) ? ((sts - sto) / sectsz) : 0;
+ sc->vbsc_cfg->vbc_topology.min_io_size = 0;
+ sc->vbsc_cfg->vbc_topology.opt_io_size = 0;
+ sc->vbsc_cfg->vbc_writeback = 0;
+ sc->vbsc_cfg->max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
+ sc->vbsc_cfg->max_discard_seg = VTBLK_MAX_DISCARD_SEG;
+ sc->vbsc_cfg->discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE;
+
+ mevent_add(mdi->mi_fd, EVF_READ, mmio_vtblk_event, sc);
+ blockif_register_resize_callback(sc->bc, mmio_vtblk_resized, sc);
+
+ return (0);
+}
+
+static int
+mmio_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused,
+ uint32_t value __unused)
+{
+
+ DPRINTF(("vtblk: write to readonly reg %d", offset));
+ return (1);
+}
+
+static int
+mmio_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct mmio_vtblk_softc *sc = vsc;
+ void *ptr;
+
+ /* our caller has already verified offset and size */
+ ptr = (uint8_t *)sc->vbsc_cfg + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+
+static const struct mmio_devemu mmio_de_vblk = {
+ .me_emu = "virtio-blk",
+ .me_init = mmio_vtblk_init,
+};
+MMIO_EMUL_SET(mmio_de_vblk);
diff --git a/tests/sys/virtio/virtio.h b/tests/sys/virtio/virtio.h
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.h
@@ -0,0 +1,323 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _BHYVE_VIRTIO_H_
+#define _BHYVE_VIRTIO_H_
+
+#include <machine/atomic.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtio_ring.h>
+#include <dev/virtio/mmio/virtio_mmio.h>
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ * https://github.com/rustyrussell/virtio-spec
+ * http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ * +-----------------------------------------------+
+ * | "desc": <N> descriptors, 16 bytes each |
+ * | ----------------------------------------- |
+ * | "avail": 2 uint16; <N> uint16; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ * | "used": 2 x uint16; <N> elems; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set :
+ * NEXT descriptor is chained, so use its "next" field
+ * WRITE descriptor is for host to write into guest RAM
+ * (else host is to read from guest RAM)
+ * INDIRECT descriptor address field is (guest physical)
+ * address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <N> <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN 4096
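Illustrative sketch (not part of this change): to make the layout described above concrete, the helper below (an invented name) computes where the avail and used rings land for a queue of n descriptors, using the legacy split-ring sizes from the comment: 16-byte descriptors, 16-bit ring entries, 8-byte used elements, and the VRING_ALIGN boundary just defined.

#include <sys/param.h>	/* roundup2() */
#include <stddef.h>

static void
vring_example_offsets(unsigned int n, size_t *avail_off, size_t *used_off,
    size_t *total)
{
	size_t desc_sz, avail_sz, used_sz;

	desc_sz = 16 * (size_t)n;		/* <N> descriptors, 16 bytes each */
	avail_sz = 2 * 2 + 2 * n + 2;		/* flags, idx, ring[N], used_event */
	used_sz = 2 * 2 + 8 * n + 2;		/* flags, idx, elems[N], avail_event */

	*avail_off = desc_sz;			/* "avail" follows "desc" directly */
	*used_off = roundup2(desc_sz + avail_sz, VRING_ALIGN);
	*total = roundup2(*used_off + used_sz, VRING_ALIGN);
}

For n = 256 this yields avail at byte 4096, used at byte 8192 and 12288 bytes in total, i.e. three pages, which matches what vring_size_aligned() below computes for the same queue size.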
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_CONSOLE 0x1003
+#define VIRTIO_DEV_SCSI 0x1004
+#define VIRTIO_DEV_RANDOM 0x1005
+#define VIRTIO_DEV_9P 0x1009
+#define VIRTIO_DEV_INPUT 0x1052
+
+/*
+ * PCI revision IDs
+ */
+#define VIRTIO_REV_INPUT 1
+
+/*
+ * PCI subvendor IDs
+ */
+#define VIRTIO_SUBVEN_INPUT 0x108E
+
+/*
+ * PCI subdevice IDs
+ */
+#define VIRTIO_SUBDEV_INPUT 0x1100
+
+/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
+static inline int
+vring_size_aligned(u_int qsz)
+{
+ return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN));
+}
+
+struct mmio_devinst;
+struct vqueue_info;
+
+struct virtio_softc {
+ struct virtio_consts *vs_vc; /* constants (see below) */
+ int vs_flags; /* VIRTIO_* flags from above */
+ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
+ struct mmio_devinst *vs_mi; /* MMIO device instance */
+ uint32_t vs_negotiated_caps; /* negotiated capabilities */
+ struct vqueue_info *vs_queues; /* one per vc_nvq */
+ int vs_curq; /* current queue */
+};
+
+#define VS_LOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_lock(vs->vs_mtx); \
+} while (0)
+
+#define VS_UNLOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_unlock(vs->vs_mtx); \
+} while (0)
+
+struct virtio_consts {
+ const char *vc_name; /* name of driver (for diagnostics) */
+ int vc_nvq; /* number of virtual queues */
+ size_t vc_cfgsize; /* size of dev-specific config regs */
+ void (*vc_reset)(void *); /* called on virtual device reset */
+ void (*vc_qnotify)(void *, struct vqueue_info *);
+ /* called on QNOTIFY if no VQ notify */
+ int (*vc_cfgread)(void *, int, int, uint32_t *);
+ /* called to read config regs */
+ int (*vc_cfgwrite)(void *, int, int, uint32_t);
+ /* called to write config regs */
+ void (*vc_apply_features)(void *, uint64_t);
+ /* called to apply negotiated features */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset. When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one. The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define VQ_ALLOC 0x01 /* set once we have a pfn */
+#define VQ_BROKED 0x02 /* ??? */
+struct vqueue_info {
+ uint16_t vq_qsize; /* size of this queue (a power of 2) */
+ void (*vq_notify)(void *, struct vqueue_info *);
+ /* called instead of vc_notify, if not NULL */
+
+ struct virtio_softc *vq_vs; /* backpointer to softc */
+ uint16_t vq_num; /* we're the num'th queue in the softc */
+
+ uint16_t vq_flags; /* flags (see above) */
+ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */
+ uint16_t vq_next_used; /* index of the next used slot to be filled */
+ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */
+
+ uint32_t vq_offset; /* Offset in the control region */
+
+ struct vring_desc *vq_desc; /* descriptor array */
+ struct vring_avail *vq_avail; /* the "avail" ring */
+ struct vring_used *vq_used; /* the "used" ring */
+};
+
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+ (*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+ ((vq)->vq_avail->ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+ return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors? (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+ return (vq_ring_ready(vq) && vq->vq_last_avail !=
+ vq->vq_avail->idx);
+}
+
+
+static inline void
+vq_kick_enable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ /*
+ * Full memory barrier to make sure the store to vq_used->flags
+ * happens before the load from vq_avail->idx, which results from a
+ * subsequent call to vq_has_descs().
+ */
+ atomic_thread_fence_seq_cst();
+}
+
+static inline void
+vq_kick_disable(struct vqueue_info *vq)
+{
+
+ vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY;
+}
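Illustrative sketch (not part of this change): a device-side consumer typically pairs the two helpers above so that a notification raced in by the driver is never lost: drain the ring, re-enable kicks, then check one last time before going idle. The function below is invented for the example and takes the per-chain work as a callback.

static void
example_drain_queue(struct vqueue_info *vq,
    void (*process_chain)(struct vqueue_info *))
{
	for (;;) {
		while (vq_has_descs(vq))
			process_chain(vq);

		/* Ask the driver to notify us again ... */
		vq_kick_enable(vq);
		/* ... but re-check in case descriptors arrived meanwhile. */
		if (!vq_has_descs(vq))
			break;
		vq_kick_disable(vq);
	}
}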
+
+struct iovec;
+
+/*
+ * Request description returned by vq_getchain.
+ *
+ * Writable iovecs start at iov[req.readable].
+ */
+struct vi_req {
+ int readable; /* num of readable iovecs */
+ int writable; /* num of writable iovecs */
+ unsigned int idx; /* ring index */
+ struct iov_emul *iove; /* writable I/O state, exported on completion */
+};
+
+void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mi,
+ struct vqueue_info *queues);
+int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void vi_reset_dev(struct virtio_softc *);
+
+int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp);
+void vq_retchains(struct vqueue_info *vq, uint16_t n_chains);
+void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx,
+ uint32_t iolen);
+void vq_relchain_publish(struct vqueue_info *vq);
+void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
+void vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+void vi_mmio_write(struct virtio_softc *vs, uint64_t offset);
+#endif /* _BHYVE_VIRTIO_H_ */
diff --git a/tests/sys/virtio/virtio.c b/tests/sys/virtio/virtio.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtio.c
@@ -0,0 +1,886 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ * Copyright (c) 2019 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <dev/virtio/dbg/virtio_dbg.h>
+
+#include "debug.h"
+#include "iov_emul.h"
+#include "mmio_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * In case we decide to relax the "virtio softc comes at the
+ * front of virtio-based device softc" constraint, let's use
+ * this to convert.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Link a virtio_softc to its constants, the device softc, and
+ * the PCI emulation.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct mmio_devinst *mdi,
+ struct vqueue_info *queues)
+{
+ int i;
+
+ /* vs and dev_softc addresses must match */
+ assert((void *)vs == dev_softc);
+ vs->vs_vc = vc;
+ vs->vs_mi = mdi;
+
+ vs->vs_queues = queues;
+ for (i = 0; i < vc->vc_nvq; i++) {
+ queues[i].vq_vs = vs;
+ queues[i].vq_num = i;
+ }
+}
+
+/*
+ * Deliver an interrupt to the guest device.
+ */
+static void
+vq_interrupt(struct virtio_softc *vs)
+{
+ int fd = vs->vs_mi->mi_fd;
+ int error;
+
+ mmio_set_cfgdata32(vs->vs_mi, VIRTIO_MMIO_INTERRUPT_STATUS, VIRTIO_MMIO_INT_VRING);
+ error = ioctl(fd, VIRTIO_DBG_KICK);
+ if (error != 0)
+ EPRINTLN("device kick failed with %d\n", error);
+
+}
+
+/*
+ * Reset device (device-wide). This erases all queues, i.e.,
+ * all the queues become invalid (though we don't wipe out the
+ * internal pointers, we just clear the VQ_ALLOC flag).
+ *
+ * It resets negotiated features to "none".
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+ int i, nvq;
+
+ if (vs->vs_mtx)
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
+ nvq = vs->vs_vc->vc_nvq;
+ for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
+ vq->vq_flags = 0;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+ /* XXX Is this right? How should we actually set it? */
+ vq->vq_offset = UINT_MAX;
+ }
+ vs->vs_negotiated_caps = 0;
+ vs->vs_curq = 0;
+
+ mdi->mi_state = MIDEV_INVALID;
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_STATUS, 0);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq).
+ * The guest just gave us a page frame number, from which we can
+ * calculate the addresses of the queue.
+ */
+/* XXX Switch it back to using the virtio softc. */
+static void
+vi_vq_init(struct mmio_devinst *mdi, struct vqueue_info *vq)
+{
+ uint64_t offset;
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_LOW);
+ vq->vq_desc = (struct vring_desc *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_LOW);
+ vq->vq_avail = (struct vring_avail *)(mdi->mi_addr + offset);
+
+ offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_HIGH);
+ offset <<= 32;
+ offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_LOW);
+ vq->vq_used = (struct vring_used *)(mdi->mi_addr + offset);
+
+ /* Mark queue as allocated, and start at 0 when we use it. */
+ vq->vq_flags = VQ_ALLOC;
+ vq->vq_last_avail = 0;
+ vq->vq_next_used = 0;
+ vq->vq_save_used = 0;
+}
+
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real"
+ * descriptor.
+ */
+static inline void
+_vq_record(int i, struct vring_desc *vd, struct iovec *iov,
+ int n_iov, struct vi_req *reqp, struct iov_emul *wiove,
+ struct iov_emul *riove)
+{
+ if (i >= n_iov)
+ return;
+
+ /* XXX Handle OOM scenarios leading to iove_add failures. */
+
+ /* Preallocate a descriptor data region for the descriptor */
+ if ((vd->flags & VRING_DESC_F_WRITE) == 0) {
+ if (iove_add(riove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->readable++;
+ } else {
+ if (iove_add(wiove, vd->addr, vd->len, &iov[i]) != 0)
+ return;
+
+ reqp->writable++;
+ }
+}
+#define VQ_MAX_DESCRIPTORS 512 /* see below */
+
+static int
+vq_import_indirect(struct vring_desc **vdp __unused)
+{
+ /* XXX Use the provided vd address to read in the indirect descriptor */
+ printf("UNIMPLEMENTED %s\n", __func__);
+ exit(1);
+}
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request. If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request. This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1. Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more. We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the "flags" and "next" field of each
+ * descriptor and tells you how many are involved. Since some may
+ * be indirect, this also needs the device instance (in the
+ * mmio_devinst at vs->vs_mi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise) into the given iov[]
+ * array (of the given size). If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1. If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
+ struct vi_req *reqp)
+{
+ int i;
+ u_int ndesc, n_indir;
+ u_int idx, next;
+ struct vi_req req;
+ struct vring_desc *vdir, *vindir, *vp;
+ struct virtio_softc *vs;
+ const char *name;
+ int error;
+ struct iov_emul *riove, *wiove;
+ int fd;
+
+ vs = vq->vq_vs;
+ fd = vs->vs_mi->mi_fd;
+ name = vs->vs_vc->vc_name;
+ memset(&req, 0, sizeof(req));
+
+ vindir = NULL;
+ riove = iove_alloc();
+ wiove = iove_alloc();
+ if (riove == NULL || wiove == NULL) {
+ iove_free(riove);
+ iove_free(wiove);
+ return (-1);
+ }
+
+ /*
+ * Note: it's the responsibility of the guest not to
+ * update vq->vq_avail->idx until all of the descriptors
+ * the guest has written are valid (including all their
+ * "next" fields and "flags").
+ *
+ * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is
+ * the number of descriptors the device has made available
+ * since the last time we updated vq->vq_last_avail.
+ *
+ * We just need to do the subtraction as an unsigned int,
+ * then trim off excess bits.
+ */
+ idx = vq->vq_last_avail;
+ ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx);
+ if (ndesc == 0) {
+ iove_free(riove);
+ iove_free(wiove);
+ return (0);
+ }
+ if (ndesc > vq->vq_qsize) {
+ /* XXX need better way to diagnose issues */
+ EPRINTLN(
+ "%s: ndesc (%u) out of range, driver confused?",
+ name, (u_int)ndesc);
+ goto error;
+ }
+
+ /*
+ * Now count/parse "involved" descriptors starting from
+ * the head of the chain.
+ *
+ * To prevent loops, we could be more complicated and
+ * check whether we're re-visiting a previously visited
+ * index, but we just abort if the count gets excessive.
+ */
+ req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)];
+ req.iove = wiove;
+ vq->vq_last_avail++;
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) {
+ if (next >= vq->vq_qsize) {
+ EPRINTLN(
+ "%s: descriptor index %u out of range, "
+ "driver confused?",
+ name, next);
+ goto error;
+ }
+ vdir = &vq->vq_desc[next];
+ if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) {
+ _vq_record(i, vdir, iov, niov, &req, wiove, riove);
+ i++;
+ } else if ((vs->vs_vc->vc_hv_caps &
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ EPRINTLN(
+ "%s: descriptor has forbidden INDIRECT flag, "
+ "driver confused?",
+ name);
+ goto error;
+ } else {
+ n_indir = vdir->len / 16;
+ if ((vdir->len & 0xf) || n_indir == 0) {
+ EPRINTLN(
+ "%s: invalid indir len 0x%x, "
+ "driver confused?",
+ name, (u_int)vdir->len);
+ goto error;
+ }
+
+ error = vq_import_indirect(&vindir);
+ if (error != 0)
+ goto error;
+ /*
+ * Indirects start at the 0th, then follow
+ * their own embedded "next"s until those run
+ * out. Each one's indirect flag must be off
+ * (we don't really have to check, could just
+ * ignore errors...).
+ */
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->flags & VRING_DESC_F_INDIRECT) {
+ EPRINTLN(
+ "%s: indirect desc has INDIR flag,"
+ " driver confused?",
+ name);
+ goto error;
+ }
+ _vq_record(i, vp, iov, niov, &req, wiove, riove);
+ if (++i > VQ_MAX_DESCRIPTORS) {
+ EPRINTLN(
+ "%s: descriptor loop? count > %d - driver confused?",
+ name, i);
+ goto error;
+ }
+ if ((vp->flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ next = vp->next;
+ if (next >= n_indir) {
+ EPRINTLN(
+ "%s: invalid next %u > %u, "
+ "driver confused?",
+ name, (u_int)next, n_indir);
+ goto error;
+ }
+ }
+ }
+ if ((vdir->flags & VRING_DESC_F_NEXT) == 0)
+ goto done;
+ }
+
+error:
+ iove_free(riove);
+ iove_free(wiove);
+ free(vindir);
+
+ return (-1);
+
+done:
+ /* Read in readable descriptors from the kernel. */
+ error = iove_import(fd, riove);
+ iove_free(riove);
+ free(vindir);
+
+ if (error != 0) {
+ EPRINTLN("Reading in data failed with %d", error);
+ return (-1);
+ }
+
+ *reqp = req;
+ return (i);
+}
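Illustrative sketch (not part of this change): a minimal notify handler built on vq_getchain() could look like the following. example_qnotify() is an invented name, and the iove_export()/iove_free() calls mirror what mmio_vtblk_done_locked() does in the block backend before the chain is returned.

static void
example_qnotify(void *vsc __unused, struct vqueue_info *vq)
{
	struct iovec iov[8];
	struct vi_req req;
	int fd, n;

	fd = vq->vq_vs->vs_mi->mi_fd;
	while (vq_has_descs(vq)) {
		n = vq_getchain(vq, iov, nitems(iov), &req);
		if (n <= 0)
			break;

		/* iov[0 .. req.readable) is readable, the rest is writable. */

		/* Push writable data back to the kernel, then release. */
		iove_export(fd, req.iove);
		iove_free(req.iove);
		vq_relchain(vq, req.idx, 0);
	}
	vq_endchains(vq, 1);
}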
+
+/*
+ * Return the first n_chain request chains back to the available queue.
+ *
+ * (These chains are the ones you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_retchains(struct vqueue_info *vq, uint16_t n_chains)
+{
+
+ vq->vq_last_avail -= n_chains;
+}
+
+void
+vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ struct vring_used *vuh;
+ struct vring_used_elem *vue;
+ uint16_t mask;
+
+ /*
+ * Notes:
+ * - mask is N-1 where N is a power of 2 so computes x % N
+ * - vuh points to the "used" data shared with guest
+ * - vue points to the "used" ring entry we want to update
+ */
+ mask = vq->vq_qsize - 1;
+ vuh = vq->vq_used;
+
+ vue = &vuh->ring[vq->vq_next_used++ & mask];
+ vue->id = idx;
+ vue->len = iolen;
+}
+
+void
+vq_relchain_publish(struct vqueue_info *vq)
+{
+ /*
+ * Ensure the used descriptor is visible before updating the index.
+ * This is necessary on ISAs with memory ordering less strict than x86
+ * (and even on x86 to act as a compiler barrier).
+ */
+ atomic_thread_fence_rel();
+ vq->vq_used->idx = vq->vq_next_used;
+}
+
+/*
+ * Return specified request chain to the guest, setting its I/O length
+ * to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+ vq_relchain_prepare(vq, idx, iolen);
+ vq_relchain_publish(vq);
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one. If driver used all the available
+ * chains, used_all should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt. Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when he decided to finish interrupt
+ * processing -- it's possible that descriptors became available after
+ * that point. (It's also typically a constant 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+ struct virtio_softc *vs;
+ uint16_t event_idx, new_idx, old_idx;
+ int intr;
+
+ /*
+ * Interrupt generation: if we're using EVENT_IDX,
+ * interrupt if we've crossed the event threshold.
+ * Otherwise interrupt is generated if we added "used" entries,
+ * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+ *
+ * In any case, though, if NOTIFY_ON_EMPTY is set and the
+ * entire avail was processed, we need to interrupt always.
+ */
+ vs = vq->vq_vs;
+ old_idx = vq->vq_save_used;
+ vq->vq_save_used = new_idx = vq->vq_used->idx;
+
+ /*
+ * Use full memory barrier between "idx" store from preceding
+ * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
+ * "flags" field below.
+ */
+ atomic_thread_fence_seq_cst();
+ if (used_all_avail &&
+ (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+ intr = 1;
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+ event_idx = VQ_USED_EVENT_IDX(vq);
+ /*
+ * This calculation is per docs and the kernel
+ * (see src/sys/dev/virtio/virtio_ring.h).
+ */
+ intr = (uint16_t)(new_idx - event_idx - 1) <
+ (uint16_t)(new_idx - old_idx);
+ } else {
+ intr = new_idx != old_idx &&
+ !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ if (intr)
+ vq_interrupt(vs);
+}
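Illustrative sketch (not part of this change): the EVENT_IDX comparison above, factored into a standalone helper (invented name) so the modulo-2^16 arithmetic is easier to check.

/*
 * True when the used index has moved past used_event, i.e. when
 * used_event lies in [old_idx, new_idx) modulo 2^16, exactly the
 * expression vq_endchains() evaluates above.
 */
static inline int
example_event_idx_intr(uint16_t new_idx, uint16_t old_idx, uint16_t used_event)
{
	return ((uint16_t)(new_idx - used_event - 1) <
	    (uint16_t)(new_idx - old_idx));
}

For instance, with old_idx = 10, new_idx = 13 and used_event = 11 the helper returns 1 and an interrupt is raised; with used_event = 14 the left-hand side wraps to 65534 and the interrupt is suppressed.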
+
+/* Note: these are in sorted order to make for a fast search */
+static struct config_reg {
+ uint16_t cr_offset; /* register offset */
+ uint8_t cr_ro; /* true => reg is read only */
+ const char *cr_name; /* name of reg */
+} config_regs[] = {
+ { VIRTIO_MMIO_MAGIC_VALUE, 1, "MAGIC_VALUE" },
+ { VIRTIO_MMIO_VERSION, 1, "VERSION" },
+ { VIRTIO_MMIO_DEVICE_ID, 1, "DEVICE_ID" },
+ { VIRTIO_MMIO_VENDOR_ID, 1, "VENDOR_ID" },
+ { VIRTIO_MMIO_HOST_FEATURES, 1, "HOST_FEATURES" },
+ { VIRTIO_MMIO_HOST_FEATURES_SEL, 0, "HOST_FEATURES_SEL" },
+ { VIRTIO_MMIO_GUEST_FEATURES, 0, "GUEST_FEATURES" },
+ { VIRTIO_MMIO_GUEST_FEATURES_SEL, 0, "GUEST_FEATURES_SEL" },
+ { VIRTIO_MMIO_QUEUE_SEL, 0, "QUEUE_SEL" },
+ { VIRTIO_MMIO_QUEUE_NUM_MAX, 1, "QUEUE_NUM_MAX" },
+ { VIRTIO_MMIO_QUEUE_NUM, 0, "QUEUE_NUM" },
+ { VIRTIO_MMIO_QUEUE_READY, 0, "QUEUE_READY" },
+ { VIRTIO_MMIO_QUEUE_NOTIFY, 0, "QUEUE_NOTIFY" },
+ { VIRTIO_MMIO_INTERRUPT_STATUS, 1, "INTERRUPT_STATUS" },
+ { VIRTIO_MMIO_INTERRUPT_ACK, 0, "INTERRUPT_ACK" },
+ { VIRTIO_MMIO_STATUS, 0, "STATUS" },
+ { VIRTIO_MMIO_QUEUE_DESC_LOW, 0, "QUEUE_DESC_LOW" },
+ { VIRTIO_MMIO_QUEUE_DESC_HIGH, 0, "QUEUE_DESC_HIGH" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_LOW, 0, "QUEUE_AVAIL_LOW" },
+ { VIRTIO_MMIO_QUEUE_AVAIL_HIGH, 0, "QUEUE_AVAIL_HIGH" },
+ { VIRTIO_MMIO_QUEUE_USED_LOW, 0, "QUEUE_USED_LOW" },
+ { VIRTIO_MMIO_QUEUE_USED_HIGH, 0, "QUEUE_USED_HIGH" },
+ { VIRTIO_MMIO_CONFIG_GENERATION, 1, "CONFIG_GENERATION" },
+};
+
+static inline struct config_reg *
+vi_find_cr(int offset) {
+ u_int hi, lo, mid;
+ struct config_reg *cr;
+
+ lo = 0;
+ hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
+ while (hi >= lo) {
+ mid = (hi + lo) >> 1;
+ cr = &config_regs[mid];
+ if (cr->cr_offset == offset)
+ return (cr);
+ if (cr->cr_offset < offset)
+ lo = mid + 1;
+ else
+ hi = mid - 1;
+ }
+ return (NULL);
+}
+
+static void
+vi_handle_state_change(struct mmio_devinst *mdi, uint32_t status)
+{
+ switch (mdi->mi_state) {
+ case MIDEV_INVALID:
+ if (status & VIRTIO_CONFIG_STATUS_ACK)
+ mdi->mi_state = MIDEV_ACKNOWLEDGED;
+ break;
+
+ case MIDEV_ACKNOWLEDGED:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER)
+ mdi->mi_state = MIDEV_DRIVER_FOUND;
+ break;
+
+ case MIDEV_DRIVER_FOUND:
+ if (status & VIRTIO_CONFIG_S_FEATURES_OK)
+ mdi->mi_state = MIDEV_FEATURES_OK;
+ break;
+
+ case MIDEV_FEATURES_OK:
+ if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK)
+ mdi->mi_state = MIDEV_LIVE;
+
+ break;
+
+ case MIDEV_LIVE:
+ break;
+
+ case MIDEV_FAILED:
+ mdi->mi_state = MIDEV_FAILED;
+ break;
+
+ default:
+ EPRINTLN("invalid device state %d", mdi->mi_state);
+ exit(1);
+ }
+}
+
+static void
+vi_handle_status(struct virtio_softc *vs, uint32_t status)
+{
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (status & VIRTIO_CONFIG_STATUS_FAILED) {
+ mdi->mi_state = MIDEV_FAILED;
+ return;
+ }
+
+ if (status & VIRTIO_CONFIG_STATUS_RESET) {
+ mdi->mi_state = MIDEV_INVALID;
+ vi_reset_dev(vs);
+ return;
+ }
+
+ vi_handle_state_change(mdi, status);
+}
+
+static void
+vi_handle_host_features_sel(struct virtio_softc *vs, uint32_t sel)
+{
+ uint64_t caps = vs->vs_vc->vc_hv_caps;
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (sel > 1) {
+ EPRINTLN("HOST_FEATURES SEL 0x%x, "
+ "driver confused?", sel);
+ return;
+ }
+
+ if (sel == 1) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)(caps >> 32));
+ } else {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES,
+ (uint32_t)caps);
+ }
+}
+
+static void
+vi_handle_guest_features(struct virtio_softc *vs, uint32_t features)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc = vs->vs_vc;
+ uint64_t caps;
+ int hi;
+
+ /*
+ * XXX Add asserts to ensure we are negotiating w/ the device
+ * and not in the middle of an operation.
+ */
+
+ hi = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_GUEST_FEATURES_SEL);
+ if (hi > 1) {
+ EPRINTLN("GUEST_FEATURES_SEL 0x%x, "
+ "driver confused?", hi);
+ return;
+ }
+
+ if (hi == 1) {
+ /* Update the upper bits, keep the lower ones intact. */
+ caps = (vc->vc_hv_caps >> 32) & features;
+ vs->vs_negotiated_caps &= (1UL << 32) - 1;
+ vs->vs_negotiated_caps |= (caps << 32);
+ } else {
+ /* Update the lower bits, keep the upper ones intact. */
+ caps = (uint32_t)vc->vc_hv_caps & features;
+ vs->vs_negotiated_caps &= ~((1UL << 32) - 1);
+ vs->vs_negotiated_caps |= caps;
+
+ /* The LSBs get sent second, we are ready to apply the features. */
+ if (vc->vc_apply_features)
+ (*vc->vc_apply_features)(DEV_SOFTC(vs),
+ vs->vs_negotiated_caps);
+ }
+
+}
+
+
+static void
+vi_handle_queue_sel(struct virtio_softc *vs)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct vqueue_info *vq;
+
+ vs->vs_curq = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_SEL);
+
+ if (vs->vs_curq < 0 || vs->vs_curq >= vs->vs_vc->vc_nvq) {
+ EPRINTLN("Selected queue %d, driver confused?", vs->vs_curq);
+ return;
+ }
+
+ vq = &vs->vs_queues[vs->vs_curq];
+ if (vq_ring_ready(vq)) {
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 1);
+ return;
+ }
+
+ /* Part of virtqueue initialization. */
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_NUM_MAX, vq->vq_qsize);
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0);
+
+ return;
+}
+
+static void
+vi_handle_queue_num(struct virtio_softc *vs, int32_t qsize)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+
+ if (qsize > vq->vq_qsize || !powerof2(qsize)) {
+ EPRINTLN("QUEUE_NUM %d is invalid, driver confused?", qsize);
+ return;
+ }
+
+ vq->vq_qsize = qsize;
+}
+
+static void
+vi_handle_queue_ready(struct virtio_softc *vs, uint32_t ready)
+{
+ struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq];
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ if (ready > 1) {
+ EPRINTLN("QUEUE_READY has value %d, driver confused?", ready);
+ return;
+ }
+
+ if (ready == 1 && !vq_ring_ready(vq)) {
+ vi_vq_init(mdi, vq);
+ return;
+ }
+}
+
+static void
+vi_handle_interrupt_ack(struct virtio_softc *vs, uint32_t ack)
+{
+ struct mmio_devinst *mdi = vs->vs_mi;
+
+ /*
+ * Follow the protocol even if we are executing the
+ * interrupt ourselves, so we are the ones that sent
+ * the ACK from the kernel in the first place.
+ */
+ if (ack != 1) {
+ EPRINTLN("INTERRUPT_ACK has value %d, "
+ "driver confused?", ack);
+ return;
+ }
+
+ mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_ACK, 0);
+}
+
+static void
+vi_handle_queue_notify(struct virtio_softc *vs, uint32_t ind)
+{
+ struct virtio_consts *vc = vs->vs_vc;
+ struct vqueue_info *vq;
+
+ if (ind >= (unsigned int)vc->vc_nvq) {
+ EPRINTLN("%s: queue %u notify out of range",
+ vc->vc_name, ind);
+ return;
+ }
+
+ vq = &vs->vs_queues[ind];
+ if (vq->vq_notify) {
+ (*vq->vq_notify)(DEV_SOFTC(vs), vq);
+ } else if (vc->vc_qnotify) {
+ (*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+ } else {
+ EPRINTLN("%s: qnotify value %d: missing vq/vc notify",
+ vc->vc_name, ind);
+ }
+
+}
+
+void
+vi_mmio_write(struct virtio_softc *vs, uint64_t offset)
+{
+ /* Reported writes are always 32-bit. */
+ const int size = 4;
+
+ struct mmio_devinst *mdi = vs->vs_mi;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ const char *name;
+ uint32_t newoff;
+ int32_t value;
+ uint64_t max;
+ int error;
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+
+ /* Writes that land in the device-specific config space go to the device. */
+ if (offset >= VIRTIO_MMIO_CONFIG) {
+ newoff = offset - VIRTIO_MMIO_CONFIG;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : (mdi->mi_bytes - VIRTIO_MMIO_CONFIG);
+ if (newoff + size > max)
+ goto bad;
+
+ value = mmio_get_cfgdata32(mdi, offset);
+
+ if (vc->vc_cfgwrite != NULL)
+ error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+ else
+ error = 0;
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset);
+ if (cr == NULL) {
+ EPRINTLN("%s: write to bad offset %jd",
+ name, (uintmax_t)offset);
+ goto done;
+
+ }
+
+ if (cr->cr_ro) {
+ EPRINTLN("%s: write to read-only reg %s",
+ name, cr->cr_name);
+ goto done;
+ }
+
+ value = mmio_get_cfgdata32(mdi, cr->cr_offset);
+
+ switch (cr->cr_offset) {
+ case VIRTIO_MMIO_STATUS:
+ vi_handle_status(vs, value);
+ break;
+
+ case VIRTIO_MMIO_HOST_FEATURES_SEL:
+ vi_handle_host_features_sel(vs, value);
+ break;
+
+ case VIRTIO_MMIO_GUEST_FEATURES:
+ vi_handle_guest_features(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_SEL:
+ vi_handle_queue_sel(vs);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NUM:
+ vi_handle_queue_num(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_READY:
+ vi_handle_queue_ready(vs, value);
+ break;
+
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ vi_handle_queue_notify(vs, value);
+ break;
+
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ vi_handle_interrupt_ack(vs, value);
+ break;
+ default:
+ EPRINTLN("Unhandled offset %d\n", cr->cr_offset);
+ assert(0);
+ }
+
+done:
+
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/tests/sys/virtio/virtiodbg.c b/tests/sys/virtio/virtiodbg.c
new file mode 100644
--- /dev/null
+++ b/tests/sys/virtio/virtiodbg.c
@@ -0,0 +1,105 @@
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "debug.h"
+#include "mevent.h"
+#include "mmio_emul.h"
+
+static void
+virtiodbg_usage(int code)
+{
+ const char *progname;
+
+ progname = getprogname();
+
+ fprintf(stderr,
+ "Usage: %s [-hot]\n"
+ " -h: help\n"
+ " -o: set config 'var' to 'value'\n"
+ " -t: MMIO device type\n",
+ progname);
+ exit(code);
+}
+
+static bool
+virtiodbg_parse_config_option(nvlist_t *nvl, const char *option)
+{
+ const char *key;
+ char *value;
+
+ key = option;
+ value = strchr(option, '=');
+ if (value == NULL || value[1] == '\0')
+ return (false);
+
+ *value = '\0';
+
+ set_config_value_node(nvl, key, value + 1);
+ return (true);
+}
+
+
+static nvlist_t *
+virtiodbg_optparse(int argc, char **argv)
+{
+ const char *optstr;
+ nvlist_t *nvl;
+ int c;
+
+ nvl = create_config_node("device");
+
+ optstr = "ho:t:";
+ while ((c = getopt(argc, argv, optstr)) != -1) {
+ switch (c) {
+ case 't':
+ if (strncmp(optarg, "help", strlen(optarg)) == 0) {
+ mmio_print_supported_devices();
+ exit(0);
+ } else if (mmio_parse_device(nvl, optarg) != 0)
+ exit(4);
+ else
+ break;
+ case 'o':
+ if (!virtiodbg_parse_config_option(nvl, optarg)) {
+ errx(EX_USAGE,
+ "invalid configuration option '%s'",
+ optarg);
+ }
+ break;
+ case 'h':
+ virtiodbg_usage(0);
+ default:
+ virtiodbg_usage(1);
+ }
+ }
+
+ return (nvl);
+}
+
+int
+main(int argc, char *argv[])
+{
+ nvlist_t *nvl;
+
+ init_config();
+ nvl = virtiodbg_optparse(argc, argv);
+
+ /* Exit if a device emulation finds an error in its initialization */
+ if (init_mmio(nvl) != 0) {
+ EPRINTLN("Device emulation initialization error: %s",
+ strerror(errno));
+ exit(4);
+ }
+
+ /* Head off to the main event dispatch loop. */
+ mevent_dispatch();
+
+ exit(4);
+}
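Illustrative note (not part of this change): with the options above, an invocation of the resulting tool would look roughly like the line below. The exact argument accepted by -t is decided by mmio_parse_device(), which is outside this hunk, so the device string is an assumption; path and serial are the nvlist keys the virtio-blk backend reads.

virtiodbg -t virtio-blk -o path=/tmp/test.img -o serial=VTBLKTEST0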
