diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -3438,6 +3438,7 @@ dev/virtio/mmio/virtio_mmio_cmdline.c optional virtio_mmio dev/virtio/mmio/virtio_mmio_fdt.c optional virtio_mmio fdt dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio +dev/virtio/dbg/virtio_dbg.c optional virtio_dbg dev/virtio/network/if_vtnet.c optional vtnet dev/virtio/balloon/virtio_balloon.c optional virtio_balloon dev/virtio/block/virtio_blk.c optional virtio_blk diff --git a/sys/dev/virtio/dbg/virtio_dbg.h b/sys/dev/virtio/dbg/virtio_dbg.h new file mode 100644 --- /dev/null +++ b/sys/dev/virtio/dbg/virtio_dbg.h @@ -0,0 +1,25 @@ +#ifndef _VIRTIO_DBG_ +#define _VIRTIO_DBG_ + +#include +#include + +struct vtdbg_transfer { + caddr_t vtdt_device; + caddr_t vtdt_driver; + size_t vtdt_len; +}; + +struct vtdbg_io_args { + struct vtdbg_transfer *transfers; + size_t cnt; + bool touser; +}; + +#define VIRTIO_DBG_INIT _IO('v', 1) +#define VIRTIO_DBG_KICK _IO('v', 2) +#define VIRTIO_DBG_ACK _IO('v', 3) +#define VIRTIO_DBG_TRANSFER _IOWR('v', 4, struct vtdbg_io_args) + + +#endif /* _VIRTIO_DBG_ */ diff --git a/sys/dev/virtio/dbg/virtio_dbg.c b/sys/dev/virtio/dbg/virtio_dbg.c new file mode 100644 --- /dev/null +++ b/sys/dev/virtio/dbg/virtio_dbg.c @@ -0,0 +1,970 @@ +/*- + * Copyright (c) 2024 Emil Tsalapatis + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "virtio_mmio_if.h" + +#define VTDBG_MAGIC ((uint64_t)0x84848484ULL) + +/* + * XXX Determine these sizes in a well-defined + * per-device fashion. + */ +#define VTDBG_MAPSZ (1024 * 1024 * 10) +#define VTDBG_RESERVE_DEVSPACE (4096) + +/* XXX Remove after development is done. */ +#define VTDBG_WARN(format, ...) 
\ + do { \ + printf("(%s:%d) " format, __func__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +static device_t vtdbg_parent; +static driver_t *vtdbg_driver; + +#define VTDBG_UPDATE_DESC (0x01) +#define VTDBG_UPDATE_USED (0x02) +#define VTDBG_UPDATE_AVAIL (0x04) +#define VTDBG_INTR_PENDING (0x08) +#define VTDBG_INTR_EXITING (0x10) + +/* + * Information on a debug device instance. Accessed + * through the control device's softc. + */ +struct vtdbg_softc { + struct mtx vtd_mtx; + struct cv vtd_cv; + struct knlist vtd_note; + uint32_t vtd_magic; + + vm_object_t vtd_object; + vm_ooffset_t vtd_baseaddr; + size_t vtd_bytes; + size_t vtd_allocated; + + virtqueue_intr_t *vtd_intr; + void *vtd_intr_arg; + struct proc *vtd_pintr; + + vm_ooffset_t vtd_offset; + + uint32_t vtd_flags; + + device_t vtd_dev; +}; + +/* + * Subclass of vtmmio_softc that also lets the virtio device access + * vtdbg related information while also being usable from vtmmio_* + * methods. The vtdbg_softc * is the softc of the control device and + * is allocated dynamically when opening an instance of the control device, + * while the virtio_dbg_softc here is allocated during device_t creation. + */ +struct virtio_dbg_softc { + struct vtmmio_softc vtmdbg_mmio; + struct vtdbg_softc *vtmdbg_dbg; +}; + +/* + * Store the parent bus and driver pointers for the debug devices, + * because we need them when creating debug devices on-demand later on. + * We are hanging off of the nexus, so we are certain it's not going away. + */ +static void +virtio_dbg_identify(driver_t *driver, device_t parent) +{ + vtdbg_parent = parent; + vtdbg_driver = driver; +} + +static struct vtdbg_softc * +vtmmio_get_vtdbg(device_t dev) +{ + struct virtio_dbg_softc *sc; + + sc = device_get_softc(dev); + MPASS(sc->vtmdbg_dbg->vtd_magic == VTDBG_MAGIC); + + return (sc->vtmdbg_dbg); +} + +/* + * Explicitly turn polling into a no-op. + */ +static int +virtio_dbg_poll(device_t dev) +{ + + return (0); +} + + +/* + * Make sure the shared virtio device region between kernel and userspace + * is configured properly. + */ +static int +virtio_dbg_probe(device_t dev) +{ + struct virtio_dbg_softc *sc; + struct vtmmio_softc *mmiosc; + uint32_t magic, version; + + sc = device_get_softc(dev); + mmiosc = &sc->vtmdbg_mmio; + + /* Fake platform to trigger virtio_mmio_note() on writes. */ + sc->vtmdbg_mmio.platform = dev; + + magic = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_MAGIC_VALUE); + if (magic != VIRTIO_MMIO_MAGIC_VIRT) { + device_printf(dev, "Bad magic value %#x\n", magic); + return (ENXIO); + } + + version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION); + if (version != 2) { + device_printf(dev, "Unsupported version: %#x\n", version); + return (ENXIO); + } + + if (vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_DEVICE_ID) == 0) + return (ENXIO); + + device_set_desc(dev, "VirtIO Emulated MMIO adapter"); + + return (0); +} + +/* + * Creates the virtio device corresponding to the transport instance. + */ +static int +virtio_dbg_attach(device_t dev) +{ + struct virtio_dbg_softc *sc; + struct vtmmio_softc *mmiosc; + device_t child; + + sc = device_get_softc(dev); + mmiosc = &sc->vtmdbg_mmio; + + mmiosc->dev = dev; + mmiosc->vtmmio_version = vtmmio_read_config_4(mmiosc, VIRTIO_MMIO_VERSION); + + vtmmio_reset(mmiosc); + + /* Tell the host we've noticed this device. 
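Setting the ACKNOWLEDGE status bit is the first step of the virtio initialization handshake; the child device driver later negotiates features and sets DRIVER_OK through the same status register.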
*/ + vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); + + mtx_lock(&Giant); + if ((child = device_add_child(dev, NULL, -1)) == NULL) { + device_printf(dev, "Cannot create child device.\n"); + vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); + + DEVICE_DETACH(dev); + mtx_unlock(&Giant); + + return (ENOMEM); + } + + mmiosc->vtmmio_child_dev = child; + vtmmio_probe_and_attach_child(mmiosc); + + mtx_unlock(&Giant); + + return (0); +} + +/* + * Recompute the queue descriptor to be an offset within the shared user/kernel + * device control region. Our userspace cannot meaningfully translate + * kernel physical addresses, so we transform the values in the queue + * descriptor address registers into offsets. Userspace finds the vq address + * by adding the offset to its own virtual address for the region. + */ +static void +virtio_dbg_qdesc_offset(struct vtmmio_softc *sc, uint64_t baseaddr, + int hireg, int loreg) +{ + struct resource *res = sc->res[0]; + uint32_t hi, lo; + uint64_t qaddr; + + /* Read in the components of the physical address. */ + hi = bus_read_4(res, hireg); + lo = bus_read_4(res, loreg); + + /* Recompute into an offset into the vq control region. */ + qaddr = (((uint64_t)hi) << 32 | (uint64_t)lo); + qaddr -= vtophys(baseaddr); + + /* Update the register values. */ + hi = (qaddr >> 32); + lo = (qaddr & ((1ULL << 32) - 1)); + + /* Direct bus write because to avoid triggering note(). */ + bus_write_4(res, hireg, hi); + bus_write_4(res, loreg, lo); +} + +/* Notify userspace of a write, and wait for a response. */ +static int +virtio_dbg_note(device_t dev, size_t offset, int val) +{ + struct vtdbg_softc *vtdsc; + struct virtio_dbg_softc *sc; + + sc = device_get_softc(dev); + vtdsc = sc->vtmdbg_dbg; + MPASS(vtdsc->vtd_magic == VTDBG_MAGIC); + + /* + * Intercept writes to the QUEUE_{DESC, AVAIL, USED}_{HIGH, LOW} + * registers and instead pass to the user the offset from the beginning + * of the control region. Do not actually notify userspace of the writes, + * we will recompute and notify once we set VIRTIO_MMIO_QUEUE_READY. + * + * Both high and low registers are set together, so just track writes to + * the high address bits. + */ + switch (offset) { + case VIRTIO_MMIO_QUEUE_DESC_HIGH: + vtdsc->vtd_flags |= VTDBG_UPDATE_DESC; + return (1); + case VIRTIO_MMIO_QUEUE_USED_HIGH: + vtdsc->vtd_flags |= VTDBG_UPDATE_USED; + return (1); + case VIRTIO_MMIO_QUEUE_AVAIL_HIGH: + vtdsc->vtd_flags |= VTDBG_UPDATE_AVAIL; + return (1); + } + + /* Only forward the listed register writes to userspace. */ + switch (offset) { + case VIRTIO_MMIO_HOST_FEATURES_SEL: + case VIRTIO_MMIO_GUEST_FEATURES: + case VIRTIO_MMIO_QUEUE_SEL: + case VIRTIO_MMIO_QUEUE_NUM: + case VIRTIO_MMIO_QUEUE_NOTIFY: + case VIRTIO_MMIO_INTERRUPT_ACK: + case VIRTIO_MMIO_STATUS: + break; + case VIRTIO_MMIO_QUEUE_READY: + /* if changed, transform the offsets. 
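By the time the driver writes QUEUE_READY it has programmed all three ring addresses, so this is the last point at which the kernel physical addresses can be rewritten as offsets into the shared control region before userspace reads them.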
*/ + if (vtdsc->vtd_flags & VTDBG_UPDATE_DESC) { + virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr, + VIRTIO_MMIO_QUEUE_DESC_HIGH, VIRTIO_MMIO_QUEUE_DESC_LOW); + vtdsc->vtd_flags &= ~VTDBG_UPDATE_DESC; + } + + if (vtdsc->vtd_flags & VTDBG_UPDATE_USED) { + virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr, + VIRTIO_MMIO_QUEUE_USED_HIGH, VIRTIO_MMIO_QUEUE_USED_LOW); + vtdsc->vtd_flags &= ~VTDBG_UPDATE_USED; + } + + if (vtdsc->vtd_flags & VTDBG_UPDATE_AVAIL) { + virtio_dbg_qdesc_offset(&sc->vtmdbg_mmio, vtdsc->vtd_baseaddr, + VIRTIO_MMIO_QUEUE_AVAIL_HIGH, VIRTIO_MMIO_QUEUE_AVAIL_LOW); + vtdsc->vtd_flags &= ~VTDBG_UPDATE_AVAIL; + } + break; + default: + return (1); + } + + mtx_lock(&vtdsc->vtd_mtx); + vtdsc->vtd_offset = offset; + KNOTE_LOCKED(&vtdsc->vtd_note, 0); + + /* + * We cannot sleep here because this code is called holding non-sleepable locks. + * This is because this busy wait's corresponding operation for other transports is + * a VM exit, which is instantaneous from the point of view of the guest kernel. + * To prevent a "sleeping thread" panic, we busy wait here. There is always the + * danger of our VMM process leaving us hanging, but that is always a danger even + * with non-emulated virtio transports - it just isn't visible to the guest, since + * the VMM is normally on the host. + */ + while (vtdsc->vtd_offset != 0) { + mtx_unlock(&vtdsc->vtd_mtx); + cpu_spinwait(); + mtx_lock(&vtdsc->vtd_mtx); + } + + mtx_unlock(&vtdsc->vtd_mtx); + + return (1); +} + +/* + * Pass interrupt information to the cdev. The cdev will be directly + * running the device interrupt handling code as an ioctl. + */ +static int +virtio_dbg_setup_intr(device_t dev, device_t mmio_dev, void *handler, void *ih_user) +{ + struct vtdbg_softc *sc; + + sc = vtmmio_get_vtdbg(dev); + MPASS(sc->vtd_magic == VTDBG_MAGIC); + + mtx_lock(&sc->vtd_mtx); + sc->vtd_intr = handler; + sc->vtd_intr_arg = ih_user; + mtx_unlock(&sc->vtd_mtx); + + return (0); +} + +static device_method_t virtio_dbg_methods[] = { + DEVMETHOD(device_attach, virtio_dbg_attach), + DEVMETHOD(device_identify, virtio_dbg_identify), + DEVMETHOD(device_probe, virtio_dbg_probe), + + DEVMETHOD(virtio_mmio_poll, virtio_dbg_poll), + DEVMETHOD(virtio_mmio_note, virtio_dbg_note), + DEVMETHOD(virtio_mmio_setup_intr, virtio_dbg_setup_intr), + + DEVMETHOD_END +}; + +DEFINE_CLASS_1(virtio_dbg, virtio_dbg_driver, virtio_dbg_methods, + sizeof(struct vtdbg_softc), vtmmio_driver); +/* + * XXX We are currently hanging off of the nexus, not 100% it's the right way. + */ +DRIVER_MODULE(virtio_dbg, nexus, virtio_dbg_driver, 0, 0); +MODULE_VERSION(virtio_dbg, 1); + +static struct cdev *vtdbg_dev; + +/* + * Create and map the device memory into the kernel. + */ +static int +vtdbg_map_kernel(struct vtdbg_softc *sc) +{ + vm_object_t obj = sc->vtd_object; + size_t bytes = IDX_TO_OFF(obj->size); + vm_offset_t baseaddr, tmp; + vm_page_t m, end_m; + int error; + + /* XXX Do not allow mapping twice. */ + + vm_object_reference(obj); + + /* + * Populate the object with physically contiguous pages, because + * the object is used to back the virtio device control region. 
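+ * The virtqueue rings handed out by vtdbg_ringalloc() live inside this
+ * region, and virtio_dbg_qdesc_offset() computes ring offsets relative to
+ * vtophys() of the base address, which only works if the backing pages
+ * are contiguous.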
+ */ + VM_OBJECT_WLOCK(obj); + m = vm_page_alloc_contig(obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_ZERO, obj->size, + 0, (uint64_t) -1, 1, 0, VM_MEMATTR_DEFAULT); + VM_OBJECT_WUNLOCK(obj); + if (m == NULL) { + vm_object_deallocate(obj); + return (ENOMEM); + } + + + baseaddr = VM_MIN_KERNEL_ADDRESS; + error = vm_map_find(kernel_map, obj, 0, &baseaddr, bytes, VM_MAX_KERNEL_ADDRESS, + VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + return (ENOMEM); + } + + end_m = m + (bytes / PAGE_SIZE); + tmp = baseaddr; + for (; m < end_m; m++) { + vm_page_valid(m); + pmap_zero_page(m); + pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, + VM_PROT_RW | PMAP_ENTER_WIRED, 0); + tmp += PAGE_SIZE; + vm_page_xunbusy(m); + } + + + sc->vtd_baseaddr = baseaddr; + sc->vtd_bytes = bytes; + + /* Reserve space for the device control region. */ + sc->vtd_allocated = VTDBG_RESERVE_DEVSPACE; + + return (0); +} + +static void +vtdbg_intr(void *arg) +{ + struct vtdbg_softc *sc = (struct vtdbg_softc *)arg; + + mtx_lock(&sc->vtd_mtx); + while ((sc->vtd_flags & VTDBG_INTR_EXITING) == 0) { + if ((sc->vtd_flags & VTDBG_INTR_PENDING) == 0) { + cv_wait(&sc->vtd_cv, &sc->vtd_mtx); + continue; + } + + sc->vtd_flags &= ~VTDBG_INTR_PENDING; + mtx_unlock(&sc->vtd_mtx); + + if (sc->vtd_intr) + sc->vtd_intr(sc->vtd_intr_arg); + + mtx_lock(&sc->vtd_mtx); + cv_wait(&sc->vtd_cv, &sc->vtd_mtx); + } + + sc->vtd_pintr = NULL; + cv_signal(&sc->vtd_cv); + + mtx_unlock(&sc->vtd_mtx); + + kproc_exit(0); +} + +/* + * Destroy the virtio transport instance when closing the + * corresponding control device fd. + */ +static void +vtdbg_dtor(void *arg) +{ + struct virtio_dbg_softc *devsc; + struct vtdbg_softc *sc = (struct vtdbg_softc *)arg; + vm_offset_t sva, eva; + device_t dev; + + MPASS(sc->vtd_magic == VTDBG_MAGIC); + + if (sc->vtd_pintr != NULL) { + mtx_lock(&sc->vtd_mtx); + sc->vtd_flags |= VTDBG_INTR_EXITING; + cv_signal(&sc->vtd_cv); + mtx_unlock(&sc->vtd_mtx); + + mtx_lock(&sc->vtd_mtx); + while (sc->vtd_pintr != NULL) + cv_wait(&sc->vtd_cv, &sc->vtd_mtx); + mtx_unlock(&sc->vtd_mtx); + } + + dev = sc->vtd_dev; + if (dev != NULL) { + devsc = device_get_softc(dev); + + mtx_lock(&Giant); + DEVICE_DETACH(dev); + mtx_unlock(&Giant); + + free(devsc->vtmdbg_mmio.res[0], M_DEVBUF); + device_delete_child(vtdbg_parent, dev); + } + + + if (sc->vtd_baseaddr != 0) { + sva = sc->vtd_baseaddr; + eva = sva + sc->vtd_bytes; + vm_map_remove(kernel_map, sva, eva); + pmap_remove(kernel_pmap, sva, eva); + } + + vm_object_deallocate(sc->vtd_object); + + knlist_delete(&sc->vtd_note, curthread, 0); + knlist_destroy(&sc->vtd_note); + + cv_destroy(&sc->vtd_cv); + mtx_destroy(&sc->vtd_mtx); + + free(sc, M_DEVBUF); +} + +static int +vtdbg_open(struct cdev *cdev, int oflags, int devtype, struct thread *td) +{ + size_t sz = round_page(VTDBG_MAPSZ); + struct vtdbg_softc *sc; + int error; + + sc = malloc(sizeof(struct vtdbg_softc), M_DEVBUF, M_NOWAIT|M_ZERO); + if (sc == NULL) + return (ENOMEM); + + sc->vtd_magic = VTDBG_MAGIC; + mtx_init(&sc->vtd_mtx, "vtdbg", NULL, MTX_DEF); + cv_init(&sc->vtd_cv, "vtdbg"); + + knlist_init_mtx(&sc->vtd_note, &sc->vtd_mtx); + + /* Create the common userspace/kernel virtio device region. 
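The region is backed by an OBJT_PHYS object, mapped into the kernel by vtdbg_map_kernel() below, and exposed to the userspace emulator via mmap() on the control device (vtdbg_mmap_single()).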
*/ + sc->vtd_object = vm_pager_allocate(OBJT_PHYS, NULL, sz, VM_PROT_ALL, + 0, thread0.td_ucred); + if (sc->vtd_object == NULL) { + vtdbg_dtor(sc); + return (ENOMEM); + } + + error = vtdbg_map_kernel(sc); + if (error != 0) { + vtdbg_dtor(sc); + return (error); + } + + error = kproc_create(vtdbg_intr, (void *)sc, &sc->vtd_pintr, + 0, 0, "vtdbg_intr"); + if (error != 0) { + vtdbg_dtor(sc); + return (error); + } + + error = devfs_set_cdevpriv((void *)sc, vtdbg_dtor); + if (error != 0) + vtdbg_dtor(sc); + + return (error); +} + +static int +vtdbg_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, + vm_size_t size, vm_object_t *objp, int nprot) +{ + struct vtdbg_softc *sc; + int error; + + error = devfs_get_cdevpriv((void **)&sc); + if (error != 0) + return (error); + + if (*offset + size > sc->vtd_bytes) + return (EINVAL); + + vm_object_reference(sc->vtd_object); + *objp = sc->vtd_object; + + return (0); +} + +static void * +vtdbg_ringalloc(device_t dev, size_t size) +{ + struct vtdbg_softc *sc = vtmmio_get_vtdbg(dev); + void *mem; + + MPASS(sc->vtd_magic == VTDBG_MAGIC); + + mtx_lock(&sc->vtd_mtx); + if (sc->vtd_allocated + size > sc->vtd_bytes) { + mtx_unlock(&sc->vtd_mtx); + return (NULL); + } + + mem = (void *)(sc->vtd_baseaddr + sc->vtd_allocated); + sc->vtd_allocated += size; + + mtx_unlock(&sc->vtd_mtx); + + return (mem); +} + +static device_t +vtdbg_create_transport(device_t parent, struct vtdbg_softc *vtdsc) +{ + struct virtio_dbg_softc *sc; + struct vtmmio_softc *mmiosc; + struct resource *res; + device_t transport; + + int uid = 0; + + transport = BUS_ADD_CHILD(parent, 0, virtio_dbg_driver.name, uid); + device_set_driver(transport, vtdbg_driver); + + sc = device_get_softc(transport); + mmiosc = &sc->vtmdbg_mmio; + + /* + * XXX Hack. Create the resource out of thin air to + * keep the vtmmio_write_* calls working. If we wanted to be uniform + * would be reserving the resource out of the RAM pseudobus, + * but it has no associated struct rman * instance, + * and we have already reserved this memory region + * by allocating it anyway so there is no possiblity + * of conflicts.. + */ + res = malloc(sizeof(*res), M_DEVBUF, M_WAITOK); + res->r_bushandle = vtdsc->vtd_baseaddr; + res->r_bustag = X86_BUS_SPACE_MEM; + mmiosc->res[0] = res; + + /* Ring buffer allocation callback. */ + mmiosc->vtmmio_ringalloc_cb = vtdbg_ringalloc; + + return (transport); +} + +static int +vtdbg_linkup_transport(struct vtdbg_softc *vtdsc, device_t dev) +{ + struct virtio_dbg_softc *mmiosc; + + mtx_lock(&vtdsc->vtd_mtx); + if (vtdsc->vtd_dev != NULL) { + mtx_unlock(&vtdsc->vtd_mtx); + return (EALREADY); + } + + mmiosc = device_get_softc(dev); + + /* Have the device and cdev be able to refer to each other. */ + mmiosc->vtmdbg_dbg = vtdsc; + vtdsc->vtd_dev = dev; + + mtx_unlock(&vtdsc->vtd_mtx); + + return (0); +} + +/* + * Create virtio device. This function does the initialization both + * for the emulated transport, and for the virtio device. These are + * normally (e.g., for MMIO)) created at boot time using vtmmio_probe/vtmmio_attach, + * and vtmmio_probe_and_attach_child, respectively. We do this initialization + * here because we are dynamically creating the devices after booting, so + * we must manually invoke the device probe and attach methods. + */ +static int +vtdbg_init(void) +{ + struct virtio_dbg_softc *sc; + struct vtdbg_softc *vtdsc; + device_t transport; + int error; + + /* Retrieve the mapping address/size. 
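These live in the per-open vtdbg softc that vtdbg_open() stashed as cdev private data.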
*/ + error = devfs_get_cdevpriv((void **)&vtdsc); + if (error != 0) + return (error); + + MPASS(vtdsc->vtd_magic == VTDBG_MAGIC); + + transport = vtdbg_create_transport(vtdbg_parent, vtdsc); + + error = vtdbg_linkup_transport(vtdsc, transport); + if (error != 0) + goto err; + + error = DEVICE_PROBE(transport); + if (error != 0) + goto err; + + return (DEVICE_ATTACH(transport)); + +err: + sc = device_get_softc(transport); + + /* + * Release the resource but do not notify + * the parent bus as we didn't reserve it + * from it. + */ + free(sc->vtmdbg_mmio.res[0], M_DEVBUF); + + mtx_lock(&Giant); + device_delete_child(vtdbg_parent, transport); + mtx_unlock(&Giant); + + vtdsc->vtd_dev = NULL; + + return (error); +} + +/* + * Kick the dedicated kernel interrupt process. + */ +static void +vtdbg_kick(struct vtdbg_softc *sc) +{ + mtx_lock(&sc->vtd_mtx); + sc->vtd_flags |= VTDBG_INTR_PENDING; + cv_signal(&sc->vtd_cv); + mtx_unlock(&sc->vtd_mtx); +} + +/* + * The mmio virtio code uses note() to let the host know there has been a write. + * The note() call suspends the thread until the userspace device has been properly + * emulated, at which point a userspace thread will allow it to resume. + * + * There can only be one unacknowledged interrupt outstanding at a time, so a single + * vtd_offset in the softc is enough. + */ +static void +vtdbg_ack(struct vtdbg_softc *sc) +{ + mtx_lock(&sc->vtd_mtx); + sc->vtd_offset = 0; + wakeup(sc); + mtx_unlock(&sc->vtd_mtx); +} + +/* + * Get virtio data in and out of the kernel, required by userspace to interact with + * the data pointed to by the virtqueue descriptors. + */ +static int +vtdbg_io(struct vtdbg_softc *sc, struct vtdbg_io_args *args) +{ + struct vtdbg_transfer *tf; + caddr_t driver, device; + int error = 0; + size_t len; + int i; + + tf = malloc(args->cnt * sizeof(*tf), M_DEVBUF, M_NOWAIT); + if (tf == NULL) + return (ENOMEM); + + error = copyin(args->transfers, tf, args->cnt * (sizeof(*tf))); + if (error != 0) { + free(tf, M_DEVBUF); + return (error); + } + + for (i = 0; i < args->cnt; i++) { + driver = (caddr_t)PHYS_TO_DMAP((vm_paddr_t)tf[i].vtdt_driver); + /* Translate from physical to kernel virtual. 
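The driver-side address comes from a virtqueue descriptor and is physical, hence the direct-map translation; the device-side address is a user virtual address consumed by copyin()/copyout() below.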
*/ + device = tf[i].vtdt_device; + len = tf[i].vtdt_len; + + if (args->touser) + error = copyout(driver, device, len); + else + error = copyin(device, driver, len); + + if (error != 0) + break; + } + + free(tf, M_DEVBUF); + + return (error); +} + + +static int +vtdbg_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + struct vtdbg_softc *sc; + int ret = 0; + + ret = devfs_get_cdevpriv((void **)&sc); + if (ret != 0) + return (ret); + + MPASS(sc->vtd_magic == VTDBG_MAGIC); + switch (cmd) { + case VIRTIO_DBG_INIT: + ret = vtdbg_init(); + break; + case VIRTIO_DBG_KICK: + vtdbg_kick(sc); + break; + case VIRTIO_DBG_ACK: + vtdbg_ack(sc); + break; + case VIRTIO_DBG_TRANSFER: + ret = vtdbg_io(sc, (struct vtdbg_io_args *)data); + break; + } + + return (ret); +} + +static int +vtdbg_filt_attach(struct knote *kn) +{ + kn->kn_flags |= EV_CLEAR; + return (0); +} + +static void +vtdbg_filt_detach(struct knote *kn) +{ + struct vtdbg_softc *sc; + sc = (struct vtdbg_softc *)kn->kn_hook; + MPASS(sc->vtd_magic == VTDBG_MAGIC); + + knlist_remove(&sc->vtd_note, kn, 0); + kn->kn_hook = NULL; +} + +static int +vtdbg_filt_read(struct knote *kn, long hint) +{ + struct vtdbg_softc *sc; + + + sc = (struct vtdbg_softc *)kn->kn_hook; + MPASS(sc->vtd_magic == VTDBG_MAGIC); + mtx_assert(&sc->vtd_mtx, MA_OWNED); + + if (sc->vtd_offset == 0) + return (0); + + kn->kn_data = sc->vtd_offset; + + return (1); +} + +struct filterops vtdbg_filtops = { + .f_isfd = 1, + .f_attach = vtdbg_filt_attach, + .f_detach = vtdbg_filt_detach, + .f_event = vtdbg_filt_read, +}; + +static int +vtdbg_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct vtdbg_softc *sc; + int error; + + error = devfs_get_cdevpriv((void **)&sc); + if (error != 0) + return (error); + MPASS(sc->vtd_magic == VTDBG_MAGIC); + + if (kn->kn_filter != EVFILT_READ) { + kn->kn_data = EINVAL; + return (EINVAL); + } + + kn->kn_fop = &vtdbg_filtops; + kn->kn_hook = sc; + knlist_add(&sc->vtd_note, kn, 0); + + return (0); + +} + +static struct cdevsw vtdbg_cdevsw = { + .d_open = vtdbg_open, + .d_mmap_single = vtdbg_mmap_single, + .d_ioctl = vtdbg_ioctl, + .d_kqfilter = vtdbg_kqfilter, + .d_name = "vtdbg", + .d_version = D_VERSION, +}; + +static int +vtdbg_dev_create(void) +{ + vtdbg_dev = make_dev(&vtdbg_cdevsw, 0, UID_ROOT, GID_OPERATOR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "vtdbg"); + if (vtdbg_dev == NULL) + return (ENOMEM); + + return (0); +} + +static void +vtdbg_dev_destroy(void) +{ + MPASS(vtdbg_dev != NULL); + destroy_dev(vtdbg_dev); +} + +static int +vtdbg_loader(struct module *m, int what, void *arg) +{ + int err = 0; + + switch (what) { + case MOD_LOAD: + err = vtdbg_dev_create(); + break; + case MOD_UNLOAD: + vtdbg_dev_destroy(); + break; + default: + return (EINVAL); + } + + return (err); +} + +static moduledata_t vtdbg_moddata = { + "vtdbg", + vtdbg_loader, + NULL, +}; + +DECLARE_MODULE(vtdbg, vtdbg_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE); diff --git a/sys/dev/virtio/mmio/virtio_mmio.h b/sys/dev/virtio/mmio/virtio_mmio.h --- a/sys/dev/virtio/mmio/virtio_mmio.h +++ b/sys/dev/virtio/mmio/virtio_mmio.h @@ -31,9 +31,12 @@ #ifndef _VIRTIO_MMIO_H #define _VIRTIO_MMIO_H +#ifdef _KERNEL + DECLARE_CLASS(vtmmio_driver); struct vtmmio_virtqueue; +typedef void *vtmmio_alloc_cb_t(device_t, size_t); struct vtmmio_softc { device_t dev; @@ -51,10 +54,25 @@ int vtmmio_nvqs; struct vtmmio_virtqueue *vtmmio_vqs; void *ih; + + vtmmio_alloc_cb_t *vtmmio_ringalloc_cb; }; int vtmmio_probe(device_t); int vtmmio_attach(device_t); +void 
vtmmio_reset(struct vtmmio_softc *); +uint8_t vtmmio_get_status(device_t); +void vtmmio_set_status(device_t, uint8_t); +void vtmmio_probe_and_attach_child(struct vtmmio_softc *); + +#define vtmmio_read_config_1(sc, o) \ + bus_read_1((sc)->res[0], (o)) +#define vtmmio_read_config_2(sc, o) \ + bus_read_2((sc)->res[0], (o)) +#define vtmmio_read_config_4(sc, o) \ + bus_read_4((sc)->res[0], (o)) + +#endif /* _KERNEL */ #define VIRTIO_MMIO_MAGIC_VALUE 0x000 #define VIRTIO_MMIO_VERSION 0x004 diff --git a/sys/dev/virtio/mmio/virtio_mmio.c b/sys/dev/virtio/mmio/virtio_mmio.c --- a/sys/dev/virtio/mmio/virtio_mmio.c +++ b/sys/dev/virtio/mmio/virtio_mmio.c @@ -84,19 +84,15 @@ static void vtmmio_reinit_complete(device_t); static void vtmmio_notify_virtqueue(device_t, uint16_t, bus_size_t); static int vtmmio_config_generation(device_t); -static uint8_t vtmmio_get_status(device_t); -static void vtmmio_set_status(device_t, uint8_t); static void vtmmio_read_dev_config(device_t, bus_size_t, void *, int); static uint64_t vtmmio_read_dev_config_8(struct vtmmio_softc *, bus_size_t); static void vtmmio_write_dev_config(device_t, bus_size_t, const void *, int); static void vtmmio_describe_features(struct vtmmio_softc *, const char *, uint64_t); -static void vtmmio_probe_and_attach_child(struct vtmmio_softc *); static int vtmmio_reinit_virtqueue(struct vtmmio_softc *, int); static void vtmmio_free_interrupts(struct vtmmio_softc *); static void vtmmio_free_virtqueues(struct vtmmio_softc *); static void vtmmio_release_child_resources(struct vtmmio_softc *); -static void vtmmio_reset(struct vtmmio_softc *); static void vtmmio_select_virtqueue(struct vtmmio_softc *, int); static void vtmmio_vq_intr(void *); @@ -128,13 +124,6 @@ VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \ } while (0) -#define vtmmio_read_config_1(sc, o) \ - bus_read_1((sc)->res[0], (o)) -#define vtmmio_read_config_2(sc, o) \ - bus_read_2((sc)->res[0], (o)) -#define vtmmio_read_config_4(sc, o) \ - bus_read_4((sc)->res[0], (o)) - static device_method_t vtmmio_methods[] = { /* Device interface. 
*/ DEVMETHOD(device_attach, vtmmio_attach), @@ -572,7 +561,7 @@ error = virtqueue_alloc(dev, idx, size, VIRTIO_MMIO_QUEUE_NOTIFY, VIRTIO_MMIO_VRING_ALIGN, - ~(vm_paddr_t)0, info, &vq); + ~(vm_paddr_t)0, info, &vq, sc->vtmmio_ringalloc_cb); if (error) { device_printf(dev, "cannot allocate virtqueue %d: %d\n", @@ -689,7 +678,7 @@ return (gen); } -static uint8_t +uint8_t vtmmio_get_status(device_t dev) { struct vtmmio_softc *sc; @@ -699,7 +688,7 @@ return (vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS)); } -static void +void vtmmio_set_status(device_t dev, uint8_t status) { struct vtmmio_softc *sc; @@ -875,7 +864,7 @@ virtio_describe(dev, msg, features, sc->vtmmio_child_feat_desc); } -static void +void vtmmio_probe_and_attach_child(struct vtmmio_softc *sc) { device_t dev, child; @@ -976,7 +965,7 @@ vtmmio_free_virtqueues(sc); } -static void +void vtmmio_reset(struct vtmmio_softc *sc) { diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c --- a/sys/dev/virtio/pci/virtio_pci.c +++ b/sys/dev/virtio/pci/virtio_pci.c @@ -362,7 +362,7 @@ notify_offset = vtpci_get_vq_notify_off(cn, idx); error = virtqueue_alloc(dev, idx, size, notify_offset, align, - ~(vm_paddr_t)0, info, &vq); + ~(vm_paddr_t)0, info, &vq, NULL); if (error) { device_printf(dev, "cannot allocate virtqueue %d: %d\n", idx, error); diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h --- a/sys/dev/virtio/virtio.h +++ b/sys/dev/virtio/virtio.h @@ -68,7 +68,9 @@ #define VIRTIO_DRIVER_MODULE(name, driver, evh, arg) \ DRIVER_MODULE(name, virtio_mmio, driver, evh, arg); \ - DRIVER_MODULE(name, virtio_pci, driver, evh, arg) + DRIVER_MODULE(name, virtio_pci, driver, evh, arg); \ + DRIVER_MODULE(name, virtio_dbg, driver, evh, arg) + struct virtio_pnp_match { uint32_t device_type; @@ -82,6 +84,8 @@ MODULE_PNP_INFO("U32:device_type;D:#", virtio_mmio, driver, \ &driver ## _match, 1); \ MODULE_PNP_INFO("U32:device_type;D:#", virtio_pci, driver, \ + &driver ## _match, 1) \ + MODULE_PNP_INFO("U32:device_type;D:#", virtio_dbg, driver, \ &driver ## _match, 1) #define VIRTIO_SIMPLE_PROBE(dev, driver) \ (virtio_simple_probe(dev, &driver ## _match)) diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h --- a/sys/dev/virtio/virtqueue.h +++ b/sys/dev/virtio/virtqueue.h @@ -34,6 +34,7 @@ /* Device callback for a virtqueue interrupt. */ typedef void virtqueue_intr_t(void *); +typedef void *virtqueue_alloc_cb_t(device_t, size_t); /* * Hint on how long the next interrupt should be postponed. 
This is @@ -67,7 +68,8 @@ int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, bus_size_t notify_offset, int align, vm_paddr_t highaddr, - struct vq_alloc_info *info, struct virtqueue **vqp); + struct vq_alloc_info *info, struct virtqueue **vqp, + virtqueue_alloc_cb_t *cb); void *virtqueue_drain(struct virtqueue *vq, int *last); void virtqueue_free(struct virtqueue *vq); int virtqueue_reinit(struct virtqueue *vq, uint16_t size); diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c --- a/sys/dev/virtio/virtqueue.c +++ b/sys/dev/virtio/virtqueue.c @@ -151,7 +151,8 @@ int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, bus_size_t notify_offset, int align, vm_paddr_t highaddr, - struct vq_alloc_info *info, struct virtqueue **vqp) + struct vq_alloc_info *info, struct virtqueue **vqp, + virtqueue_alloc_cb_t alloc_cb) { struct virtqueue *vq; int error; @@ -206,8 +207,12 @@ } vq->vq_ring_size = round_page(vring_size(size, align)); - vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF, - M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0); + if (alloc_cb != NULL) { + vq->vq_ring_mem = alloc_cb(dev, vq->vq_ring_size); + } else { + vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF, + M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0); + } if (vq->vq_ring_mem == NULL) { device_printf(dev, "cannot allocate memory for virtqueue ring\n"); diff --git a/tests/sys/Makefile b/tests/sys/Makefile --- a/tests/sys/Makefile +++ b/tests/sys/Makefile @@ -33,6 +33,7 @@ TESTS_SUBDIRS+= ses TESTS_SUBDIRS+= sys TESTS_SUBDIRS+= vfs +TESTS_SUBDIRS+= virtio TESTS_SUBDIRS+= vm TESTS_SUBDIRS+= vmm diff --git a/tests/sys/virtio/Makefile b/tests/sys/virtio/Makefile new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/Makefile @@ -0,0 +1,28 @@ +PROG= virtiodbg + +.PATH: ${SRCTOP}/sys/libkern + +SRCS= block_if.c \ + config.c \ + iov.c \ + iov_emul.c \ + mevent.c \ + mmio_virtio_block.c \ + mmio_emul.c \ + virtio.c \ + virtiodbg.c + +MAN= + +CFLAGS+=-I${.CURDIR} \ + -I${SRCTOP}/sys + +LIBADD+= md nv pthread + +# Disable thread safety analysis since it only finds very simple bugs and +# yields many false positives. +NO_WTHREAD_SAFETY= + +NO_WCAST_ALIGN= + +.include diff --git a/tests/sys/virtio/block_if.h b/tests/sys/virtio/block_if.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/block_if.h @@ -0,0 +1,84 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#ifndef _BLOCK_IF_H_ +#define _BLOCK_IF_H_ + +#include +#include +#include + +/* + * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in + * a single request. BLOCKIF_RING_MAX is the maxmimum number of + * pending requests that can be queued. + */ +#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ +#define BLOCKIF_RING_MAX 128 + +struct blockif_req { + int br_iovcnt; + off_t br_offset; + ssize_t br_resid; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; +}; + +struct mmio_devinst; +struct blockif_ctxt; + +typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t, uint64_t); + +int blockif_legacy_config(nvlist_t *nvl, const char *opts); +struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident); +int blockif_register_resize_callback(struct blockif_ctxt *bc, + blockif_resize_cb *cb, void *cb_arg); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); + +#endif /* _BLOCK_IF_H_ */ diff --git a/tests/sys/virtio/block_if.c b/tests/sys/virtio/block_if.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/block_if.c @@ -0,0 +1,980 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * Copyright 2020 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "config.h" +#include "debug.h" +#include "mevent.h" +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_NUMTHR 8 +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +struct blockif_ctxt { + unsigned int bc_magic; + int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + int bc_paused; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + pthread_cond_t bc_work_done_cond; + blockif_resize_cb *bc_resize_cb; + void *bc_resize_cb_arg; + struct mevent *bc_resize_event; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; + int bc_bootindex; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be, *tbe; + off_t off; + int i; + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_req = breq; + be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + return (be->be_status == BST_PEND); +} + +static int 
+blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = t; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + *bep = be; + return (1); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_elem *tbe; + + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); +} + +static int +blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +{ + struct spacectl_range range; + struct blockif_req *br; + off_t arg[2]; + ssize_t n; + size_t clen, len, off, boff, voff; + int i, err; + + br = be->be_req; + assert(br->br_resid >= 0); + + if (br->br_iovcnt <= 1) + buf = NULL; + err = 0; + switch (be->be_op) { + case BOP_READ: + if (buf == NULL) { + if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= n; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + n = pread(bc->bc_fd, buf, len, br->br_offset + off); + if (n < 0) { + err = errno; + break; + } + len = (size_t)n; + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy((uint8_t *)br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + break; + case BOP_WRITE: + if (bc->bc_rdonly) { + err = EROFS; + break; + } + if (buf == NULL) { + if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= n; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + (uint8_t *)br->br_iov[i].iov_base + voff, + clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + + n = pwrite(bc->bc_fd, buf, len, br->br_offset + off); + if (n < 0) { + err = errno; + break; + } + off += n; + br->br_resid -= n; + } + break; + case BOP_FLUSH: + err = blockif_flush_bc(bc); + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } else { + range.r_offset = br->br_offset; + range.r_len = br->br_resid; + + while (range.r_len > 0) { + if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC, + &range, 0, &range) != 0) { + err = errno; + break; + } + } + if (err == 0) + br->br_resid = 
0; + } + break; + default: + err = EINVAL; + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static inline bool +blockif_empty(const struct blockif_ctxt *bc) +{ + return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq)); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + pthread_t t; + uint8_t *buf; + + bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + + pthread_mutex_lock(&bc->bc_mtx); + for (;;) { + while (blockif_dequeue(bc, t, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be, buf); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + + /* If none to work, notify the main thread */ + if (blockif_empty(bc)) + pthread_cond_broadcast(&bc->bc_work_done_cond); + + /* Check ctxt status here to see if exit requested */ + if (bc->bc_closing) + break; + + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + } + pthread_mutex_unlock(&bc->bc_mtx); + + if (buf) + free(buf); + pthread_exit(NULL); + return (NULL); +} + +static void +blockif_sigcont_handler(int signal __unused, enum ev_type type __unused, + void *arg __unused, uint64_t data __unused) +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +} + + +struct blockif_ctxt * +blockif_open(nvlist_t *nvl, const char *ident) +{ + char tname[MAXCOMLEN + 1]; + char name[MAXPATHLEN]; + const char *path, *pssval, *ssval, *bootindex_val; + char *cp; + struct blockif_ctxt *bc; + struct stat sbuf; + struct diocgattr_arg arg; + off_t size, psectsz, psectoff; + int extra, fd, i, sectsz; + int ro, candelete, geom, ssopt, pssopt; + int nodelete; + int bootindex; + +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE }; +#endif + + pthread_once(&blockif_once, blockif_init); + + fd = -1; + extra = 0; + ssopt = 0; + ro = 0; + nodelete = 0; + bootindex = -1; + + if (get_config_bool_node_default(nvl, "nocache", false)) + extra |= O_DIRECT; + if (get_config_bool_node_default(nvl, "nodelete", false)) + nodelete = 1; + if (get_config_bool_node_default(nvl, "sync", false) || + get_config_bool_node_default(nvl, "direct", false)) + extra |= O_SYNC; + if (get_config_bool_node_default(nvl, "ro", false)) + ro = 1; + ssval = get_config_value_node(nvl, "sectorsize"); + if (ssval != NULL) { + ssopt = strtol(ssval, &cp, 10); + if (cp == ssval) { + EPRINTLN("Invalid sector size \"%s\"", ssval); + goto err; + } + if (*cp == '\0') { + pssopt = ssopt; + } else if (*cp == '/') { + pssval = cp + 1; + pssopt = strtol(pssval, &cp, 10); + if (cp == pssval || *cp != '\0') { + EPRINTLN("Invalid sector size \"%s\"", ssval); + goto err; + } + } else { + EPRINTLN("Invalid sector size \"%s\"", ssval); + goto err; + } + } + + bootindex_val = get_config_value_node(nvl, "bootindex"); + if (bootindex_val != NULL) { + bootindex = atoi(bootindex_val); + } + + path = get_config_value_node(nvl, "path"); + if (path == NULL) { + 
EPRINTLN("Missing \"path\" for block device."); + goto err; + } + + fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(path, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + warn("Could not open backing file: %s", path); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + warn("Could not stat backing file %s", path); + goto err; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + goto err; + } + assert(size != 0); + assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else { + psectsz = sbuf.st_blksize; + /* Avoid fallback implementation */ + candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1; + } + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + EPRINTLN("Invalid sector size %d/%d", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. 
+ */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + EPRINTLN("Sector size %d incompatible " + "with underlying device sector size %d", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + perror("calloc"); + goto err; + } + + bc->bc_magic = BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + pthread_cond_init(&bc->bc_work_done_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + bc->bc_bootindex = bootindex; + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } + + return (bc); +err: + if (fd >= 0) + close(fd); + return (NULL); +} + +static void +blockif_resized(int fd, enum ev_type type __unused, void *arg, + uint64_t data __unused) +{ + struct blockif_ctxt *bc; + struct stat sb; + off_t mediasize; + + if (fstat(fd, &sb) != 0) + return; + + if (S_ISCHR(sb.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) { + EPRINTLN("blockif_resized: get mediasize failed: %s", + strerror(errno)); + return; + } + } else + mediasize = sb.st_size; + + bc = arg; + pthread_mutex_lock(&bc->bc_mtx); + if (mediasize != bc->bc_size) { + bc->bc_size = mediasize; + bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size, 0); + } + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb, + void *cb_arg) +{ + struct stat sb; + int err; + + if (cb == NULL) + return (EINVAL); + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (bc->bc_resize_cb != NULL) { + err = EBUSY; + goto out; + } + + assert(bc->bc_closing == 0); + + if (fstat(bc->bc_fd, &sb) != 0) { + err = errno; + goto out; + } + + bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE, + EVFF_ATTRIB, blockif_resized, bc); + if (bc->bc_resize_event == NULL) { + err = ENXIO; + goto out; + } + + bc->bc_resize_cb = cb; + bc->bc_resize_cb_arg = cb_arg; +out: + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + assert(!bc->bc_paused); + if (!TAILQ_EMPTY(&bc->bc_freeq)) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. 
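+ * (The free list holds BLOCKIF_MAXREQ elements, so an empty list means
+ * the caller has exceeded the depth advertised by blockif_queuesz().)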
+ */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + blockif_complete(bc, be); + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it return + * prematurely via it's normal callback path. + */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int i; + + assert(bc->bc_magic == BLOCKIF_SIG); + + /* + * Stop the block i/o thread + */ + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_closing = 1; + if (bc->bc_resize_event != NULL) + mevent_disable(bc->bc_resize_event); + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. 
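+ * The geometry is clamped to the largest size addressable by CHS,
+ * 65535 cylinders x 16 heads x 255 sectors per track.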
+ */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535L * 16 * 255) + sectors = 65535L * 16 * 255; + + if (sectors >= 65536L * 16 * 63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_sectsz); +} + +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_rdonly); +} + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} diff --git a/tests/sys/virtio/config.h b/tests/sys/virtio/config.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/config.h @@ -0,0 +1,129 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#include + +/*- + * Manages a configuration database backed by an nv(9) list. + * + * The database only stores string values. Callers should parse + * values into other types if needed. 
String values can reference + * other configuration variables using a '%(name)' syntax. In this + * case, the name must be the full path of the configuration + * variable. The % character can be escaped with a preceding \ to + * avoid expansion. Any \ characters must be escaped. + * + * Configuration variables are stored in a tree. The full path of a + * variable is specified as a dot-separated name similar to sysctl(8) + * OIDs. + */ + +/* + * Fetches the value of a configuration variable. If the "raw" value + * contains references to other configuration variables, this function + * expands those references and returns a pointer to the parsed + * string. The string's storage is only stable until the next call to + * this function. + * + * If no node is found, returns NULL. + * + * If 'parent' is NULL, 'name' is assumed to be a top-level variable. + */ +const char *get_config_value_node(const nvlist_t *parent, const char *name); + +/* + * Similar to get_config_value_node but expects a full path to the + * leaf node. + */ +const char *get_config_value(const char *path); + +/* Initializes the tree to an empty state. */ +void init_config(void); + +/* + * Creates an existing configuration node via a dot-separated OID + * path. Will fail if the path names an existing leaf configuration + * variable. If the node already exists, this returns a pointer to + * the existing node. + */ +nvlist_t *create_config_node(const char *path); + +/* + * Looks for an existing configuration node via a dot-separated OID + * path. Will fail if the path names an existing leaf configuration + * variable. + */ +nvlist_t *find_config_node(const char *path); + +/* + * Similar to the above, but treats the path relative to an existing + * 'parent' node rather than as an absolute path. + */ +nvlist_t *create_relative_config_node(nvlist_t *parent, const char *path); +nvlist_t *find_relative_config_node(nvlist_t *parent, const char *path); + +/* + * Adds or replaces the value of the specified variable. + * + * If 'parent' is NULL, 'name' is assumed to be a top-level variable. + */ +void set_config_value_node(nvlist_t *parent, const char *name, + const char *value); + +/* + * Similar to set_config_value_node but only sets value if it's unset yet. + */ +void set_config_value_node_if_unset(nvlist_t *const parent, + const char *const name, const char *const value); + +/* + * Similar to set_config_value_node but expects a full path to the + * leaf node. + */ +void set_config_value(const char *path, const char *value); + +/* + * Similar to set_config_value but only sets the value if it's unset yet. + */ +void set_config_value_if_unset(const char *const path, + const char *const value); + +/* Convenience wrappers for boolean variables. */ +bool get_config_bool(const char *path); +bool get_config_bool_node(const nvlist_t *parent, const char *name); +bool get_config_bool_default(const char *path, bool def); +bool get_config_bool_node_default(const nvlist_t *parent, const char *name, + bool def); +void set_config_bool(const char *path, bool value); +void set_config_bool_node(nvlist_t *parent, const char *name, bool value); + +void dump_config(void); + +#endif /* !__CONFIG_H__ */ diff --git a/tests/sys/virtio/config.c b/tests/sys/virtio/config.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/config.c @@ -0,0 +1,464 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 John H. 
Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "config.h" + +static nvlist_t *config_root; + +void +init_config(void) +{ + + config_root = nvlist_create(0); + if (config_root == NULL) + err(4, "Failed to create configuration root nvlist"); +} + +static nvlist_t * +_lookup_config_node(nvlist_t *parent, const char *path, bool create) +{ + char *copy, *name, *tofree; + nvlist_t *nvl, *new_nvl; + + copy = strdup(path); + if (copy == NULL) + errx(4, "Failed to allocate memory"); + tofree = copy; + nvl = parent; + while ((name = strsep(©, ".")) != NULL) { + if (*name == '\0') { + warnx("Invalid configuration node: %s", path); + nvl = NULL; + break; + } + if (nvlist_exists_nvlist(nvl, name)) + /* + * XXX-MJ it is incorrect to cast away the const + * qualifier like this since the contract with nvlist + * says that values are immutable, and some consumers + * will indeed add nodes to the returned nvlist. In + * practice, however, it appears to be harmless with the + * current nvlist implementation, so we just live with + * it until the implementation is reworked. + */ + nvl = __DECONST(nvlist_t *, + nvlist_get_nvlist(nvl, name)); + else if (nvlist_exists(nvl, name)) { + for (copy = tofree; copy < name; copy++) + if (*copy == '\0') + *copy = '.'; + warnx( + "Configuration node %s is a child of existing variable %s", + path, tofree); + nvl = NULL; + break; + } else if (create) { + /* + * XXX-MJ as with the case above, "new_nvl" shouldn't be + * mutated after its ownership is given to "nvl". 
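+ * (nvlist_move_nvlist() below hands ownership of "new_nvl" to "nvl", so + * it must not be freed separately here.)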
+ */ + new_nvl = nvlist_create(0); + if (new_nvl == NULL) + errx(4, "Failed to allocate memory"); + nvlist_move_nvlist(nvl, name, new_nvl); + nvl = new_nvl; + } else { + nvl = NULL; + break; + } + } + free(tofree); + return (nvl); +} + +nvlist_t * +create_config_node(const char *path) +{ + + return (_lookup_config_node(config_root, path, true)); +} + +nvlist_t * +find_config_node(const char *path) +{ + + return (_lookup_config_node(config_root, path, false)); +} + +nvlist_t * +create_relative_config_node(nvlist_t *parent, const char *path) +{ + + return (_lookup_config_node(parent, path, true)); +} + +nvlist_t * +find_relative_config_node(nvlist_t *parent, const char *path) +{ + + return (_lookup_config_node(parent, path, false)); +} + +void +set_config_value_node(nvlist_t *parent, const char *name, const char *value) +{ + + if (strchr(name, '.') != NULL) + errx(4, "Invalid config node name %s", name); + if (parent == NULL) + parent = config_root; + if (nvlist_exists_string(parent, name)) + nvlist_free_string(parent, name); + else if (nvlist_exists(parent, name)) + errx(4, + "Attempting to add value %s to existing node %s of list %p", + value, name, parent); + nvlist_add_string(parent, name, value); +} + +void +set_config_value_node_if_unset(nvlist_t *const parent, const char *const name, + const char *const value) +{ + if (get_config_value_node(parent, name) != NULL) { + return; + } + + set_config_value_node(parent, name, value); +} + +void +set_config_value(const char *path, const char *value) +{ + const char *name; + char *node_name; + nvlist_t *nvl; + + /* Look for last separator. */ + name = strrchr(path, '.'); + if (name == NULL) { + nvl = config_root; + name = path; + } else { + node_name = strndup(path, name - path); + if (node_name == NULL) + errx(4, "Failed to allocate memory"); + nvl = create_config_node(node_name); + if (nvl == NULL) + errx(4, "Failed to create configuration node %s", + node_name); + free(node_name); + + /* Skip over '.'. */ + name++; + } + + if (nvlist_exists_nvlist(nvl, name)) + errx(4, "Attempting to add value %s to existing node %s", + value, path); + set_config_value_node(nvl, name, value); +} + +void +set_config_value_if_unset(const char *const path, const char *const value) +{ + if (get_config_value(path) != NULL) { + return; + } + + set_config_value(path, value); +} + +static const char * +get_raw_config_value(const char *path) +{ + const char *name; + char *node_name; + nvlist_t *nvl; + + /* Look for last separator. */ + name = strrchr(path, '.'); + if (name == NULL) { + nvl = config_root; + name = path; + } else { + node_name = strndup(path, name - path); + if (node_name == NULL) + errx(4, "Failed to allocate memory"); + nvl = find_config_node(node_name); + free(node_name); + if (nvl == NULL) + return (NULL); + + /* Skip over '.'. 
*/ + name++; + } + + if (nvlist_exists_string(nvl, name)) + return (nvlist_get_string(nvl, name)); + if (nvlist_exists_nvlist(nvl, name)) + warnx("Attempting to fetch value of node %s", path); + return (NULL); +} + +static char * +_expand_config_value(const char *value, int depth) +{ + FILE *valfp; + const char *cp, *vp; + char *nestedval, *path, *valbuf; + size_t valsize; + + valfp = open_memstream(&valbuf, &valsize); + if (valfp == NULL) + errx(4, "Failed to allocate memory"); + + vp = value; + while (*vp != '\0') { + switch (*vp) { + case '%': + if (depth > 15) { + warnx( + "Too many recursive references in configuration value"); + fputc('%', valfp); + vp++; + break; + } + if (vp[1] != '(' || vp[2] == '\0') + cp = NULL; + else + cp = strchr(vp + 2, ')'); + if (cp == NULL) { + warnx( + "Invalid reference in configuration value \"%s\"", + value); + fputc('%', valfp); + vp++; + break; + } + vp += 2; + + if (cp == vp) { + warnx( + "Empty reference in configuration value \"%s\"", + value); + vp++; + break; + } + + /* Allocate a C string holding the path. */ + path = strndup(vp, cp - vp); + if (path == NULL) + errx(4, "Failed to allocate memory"); + + /* Advance 'vp' past the reference. */ + vp = cp + 1; + + /* Fetch the referenced value. */ + cp = get_raw_config_value(path); + if (cp == NULL) + warnx( + "Failed to fetch referenced configuration variable %s", + path); + else { + nestedval = _expand_config_value(cp, depth + 1); + fputs(nestedval, valfp); + free(nestedval); + } + free(path); + break; + case '\\': + vp++; + if (*vp == '\0') { + warnx( + "Trailing \\ in configuration value \"%s\"", + value); + break; + } + /* FALLTHROUGH */ + default: + fputc(*vp, valfp); + vp++; + break; + } + } + fclose(valfp); + return (valbuf); +} + +static const char * +expand_config_value(const char *value) +{ + static char *valbuf; + + if (strchr(value, '%') == NULL) + return (value); + + free(valbuf); + valbuf = _expand_config_value(value, 0); + return (valbuf); +} + +const char * +get_config_value(const char *path) +{ + const char *value; + + value = get_raw_config_value(path); + if (value == NULL) + return (NULL); + return (expand_config_value(value)); +} + +const char * +get_config_value_node(const nvlist_t *parent, const char *name) +{ + + if (strchr(name, '.') != NULL) + errx(4, "Invalid config node name %s", name); + if (parent == NULL) + parent = config_root; + if (nvlist_exists_nvlist(parent, name)) + warnx("Attempt to fetch value of node %s of list %p", name, + parent); + if (!nvlist_exists_string(parent, name)) + return (NULL); + + return (expand_config_value(nvlist_get_string(parent, name))); +} + +static bool +_bool_value(const char *name, const char *value) +{ + + if (strcasecmp(value, "true") == 0 || + strcasecmp(value, "on") == 0 || + strcasecmp(value, "yes") == 0 || + strcmp(value, "1") == 0) + return (true); + if (strcasecmp(value, "false") == 0 || + strcasecmp(value, "off") == 0 || + strcasecmp(value, "no") == 0 || + strcmp(value, "0") == 0) + return (false); + err(4, "Invalid value %s for boolean variable %s", value, name); +} + +bool +get_config_bool(const char *path) +{ + const char *value; + + value = get_config_value(path); + if (value == NULL) + err(4, "Failed to fetch boolean variable %s", path); + return (_bool_value(path, value)); +} + +bool +get_config_bool_default(const char *path, bool def) +{ + const char *value; + + value = get_config_value(path); + if (value == NULL) + return (def); + return (_bool_value(path, value)); +} + +bool +get_config_bool_node(const nvlist_t 
*parent, const char *name) +{ + const char *value; + + value = get_config_value_node(parent, name); + if (value == NULL) + err(4, "Failed to fetch boolean variable %s", name); + return (_bool_value(name, value)); +} + +bool +get_config_bool_node_default(const nvlist_t *parent, const char *name, + bool def) +{ + const char *value; + + value = get_config_value_node(parent, name); + if (value == NULL) + return (def); + return (_bool_value(name, value)); +} + +void +set_config_bool(const char *path, bool value) +{ + + set_config_value(path, value ? "true" : "false"); +} + +void +set_config_bool_node(nvlist_t *parent, const char *name, bool value) +{ + + set_config_value_node(parent, name, value ? "true" : "false"); +} + +static void +dump_tree(const char *prefix, const nvlist_t *nvl) +{ + const char *name; + void *cookie; + int type; + + cookie = NULL; + while ((name = nvlist_next(nvl, &type, &cookie)) != NULL) { + if (type == NV_TYPE_NVLIST) { + char *new_prefix; + + asprintf(&new_prefix, "%s%s.", prefix, name); + dump_tree(new_prefix, nvlist_get_nvlist(nvl, name)); + free(new_prefix); + } else { + assert(type == NV_TYPE_STRING); + printf("%s%s=%s\n", prefix, name, + nvlist_get_string(nvl, name)); + } + } +} + +void +dump_config(void) +{ + dump_tree("", config_root); +} diff --git a/tests/sys/virtio/debug.h b/tests/sys/virtio/debug.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/debug.h @@ -0,0 +1,40 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _DEBUG_H_ +#define _DEBUG_H_ + + +#define FPRINTLN(filep, fmt, arg...) \ + do { \ + fprintf(filep, fmt "\n", ##arg); \ + } while (0) + +#define PRINTLN(fmt, arg...) FPRINTLN(stdout, fmt, ##arg) +#define EPRINTLN(fmt, arg...) FPRINTLN(stderr, fmt, ##arg) + +#endif diff --git a/tests/sys/virtio/iov.h b/tests/sys/virtio/iov.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/iov.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, + int *niov2, size_t seek); +void truncate_iov(struct iovec *iov, int *niov, size_t length); +size_t count_iov(const struct iovec *iov, int niov); +ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf); +ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, + int niov, size_t seek); + +#endif /* _IOV_H_ */ diff --git a/tests/sys/virtio/iov.c b/tests/sys/virtio/iov.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/iov.c @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include + +#include +#include +#include "iov.h" + +void +seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + int i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(const struct iovec *iov, int niov) +{ + size_t total = 0; + int i; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +void +truncate_iov(struct iovec *iov, int *niov, size_t length) +{ + size_t done = 0; + int i; + + for (i = 0; i < *niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek <= iov[i].iov_len) { + iov[i].iov_len = toseek; + *niov = i + 1; + return; + } + } +} + +ssize_t +iov_to_buf(const struct iovec *iov, int niov, void **buf) +{ + size_t ptr, total; + int i; + + total = count_iov(iov, niov); + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + for (i = 0, ptr = 0; i < niov; i++) { + memcpy((uint8_t *)*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov, + size_t seek) +{ + struct iovec *diov; + size_t off = 0, len; + int i; + + if (seek > 0) { + int ndiov; + + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + iov = diov; + niov = ndiov; + } + + for (i = 0; i < niov && off < buflen; i++) { + len = MIN(iov[i].iov_len, buflen - off); + memcpy(iov[i].iov_base, (const uint8_t *)buf + off, len); + off += len; + } + + if (seek > 0) + free(diov); + + return ((ssize_t)off); +} + diff --git a/tests/sys/virtio/iov_emul.h b/tests/sys/virtio/iov_emul.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/iov_emul.h @@ -0,0 +1,20 @@ +#ifndef _IOV_EMUL_E +#define _IOV_EMUL_E + +struct virtio_softc; + +struct iov_emul { + struct vtdbg_transfer *iove_tf; + size_t iove_maxcnt; + size_t iove_ind; +}; + +#define IOVE_INIT (16) + +struct iov_emul *iove_alloc(void); +void iove_free(struct iov_emul *iove); +int iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov); +int iove_import(int fd, struct iov_emul *iove); +int iove_export(int fd, struct iov_emul *iove); + +#endif /* _IOV_EMUL_E */ diff --git a/tests/sys/virtio/iov_emul.c b/tests/sys/virtio/iov_emul.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/iov_emul.c @@ -0,0 +1,106 @@ +#include +#include + +#include +#include +#include + +#include + +#include "debug.h" +#include "iov_emul.h" +#include "mmio_emul.h" +#include "virtio.h" + +struct iov_emul * +iove_alloc(void) +{ + struct iov_emul *iove; + + iove = calloc(1, sizeof(*iove)); + + iove->iove_tf = calloc(IOVE_INIT, sizeof(*iove->iove_tf)); + if (iove->iove_tf == NULL) { + free(iove); + return (NULL); + } + + iove->iove_maxcnt = IOVE_INIT; + + return (iove); +} + +void +iove_free(struct iov_emul *iove) +{ + size_t i; + + for (i = 0; i < iove->iove_ind; i++) + free(iove->iove_tf[i].vtdt_device); + + free(iove); +} + + +int +iove_add(struct iov_emul *iove, uint64_t phys, size_t len, struct iovec *iov) +{ + struct vtdbg_transfer *tf = 
iove->iove_tf; + size_t ind = iove->iove_ind; + char *base; + + if (ind == iove->iove_maxcnt){ + tf = reallocarray(tf, 2 * iove->iove_maxcnt, + sizeof(*tf)); + if (tf == NULL) + return (ENOMEM); + iove->iove_tf = tf; + iove->iove_maxcnt *= 2; + } + + base = malloc(len); + if (base == NULL) + return (ENOMEM); + + iove->iove_tf[ind].vtdt_device = base; + iove->iove_tf[ind].vtdt_driver = (caddr_t) phys; + iove->iove_tf[ind].vtdt_len = len; + iove->iove_ind += 1; + + iov->iov_base = base; + iov->iov_len = len; + + return (0); +} + + +/* + * Import a read IO vector from the kernel. + */ +int +iove_import(int fd, struct iov_emul *iove) +{ + struct vtdbg_io_args args = { + .transfers = iove->iove_tf, + .cnt = iove->iove_ind, + .touser = true, + }; + + return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args)); +} + +/* + * Export a write IO vector to the kernel. + */ +int +iove_export(int fd, struct iov_emul *iove) +{ + struct vtdbg_io_args args = { + .transfers = iove->iove_tf, + .cnt = iove->iove_ind, + .touser = false, + }; + + return (ioctl(fd, VIRTIO_DBG_TRANSFER, &args)); +} + diff --git a/tests/sys/virtio/mevent.h b/tests/sys/virtio/mevent.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/mevent.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL, + EVF_VNODE, +}; + +/* Filter flags for EVF_VNODE */ +#define EVFF_ATTRIB 0x0001 + +typedef void mevent_cb_t(int, enum ev_type, void *, uint64_t); +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, mevent_cb_t *func, + void *param); +struct mevent *mevent_add_flags(int fd, enum ev_type type, int fflags, + mevent_cb_t *func, void *param); +struct mevent *mevent_add_disabled(int fd, enum ev_type type, + mevent_cb_t *func, void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); +int mevent_timer_update(struct mevent *evp, int msecs); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/tests/sys/virtio/mevent.c b/tests/sys/virtio/mevent.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/mevent.c @@ -0,0 +1,564 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +#include +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include +#include + +#include "mevent.h" + +#define MEVENT_MAX 64 + +static pthread_t mevent_tid; +static pthread_once_t mevent_once = PTHREAD_ONCE_INIT; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static int mfd; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + mevent_cb_t *me_func; +#define me_msecs me_fd + int me_fd; + int me_timid; + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; /* Desired kevent flags. 
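(Typically EV_ADD, EV_ENABLE, EV_DISABLE or EV_DELETE, possibly OR'ed together.)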
*/ + int me_closefd; + int me_fflags; + LIST_ENTRY(mevent) me_list; +}; + +enum mevent_update_type { + UPDATE_ENABLE, + UPDATE_DISABLE, + UPDATE_TIMER, +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused, + uint64_t data __unused) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c = '\0'; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} + +static void +mevent_init(void) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mfd = kqueue(); + assert(mfd > 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (caph_rights_limit(mfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + LIST_INIT(&change_head); + LIST_INIT(&global_head); +} + +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + if (mevp->me_type == EVF_VNODE) + retval = EVFILT_VNODE; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int retval; + + retval = mevp->me_state; + + if (mevp->me_type == EVF_VNODE) + retval |= EV_CLEAR; + + return (retval); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + int retval; + + retval = 0; + + switch (mevp->me_type) { + case EVF_VNODE: + if ((mevp->me_fflags & EVFF_ATTRIB) != 0) + retval |= NOTE_ATTRIB; + break; + case EVF_READ: + case EVF_WRITE: + case EVF_TIMER: + case EVF_SIGNAL: + break; + } + + return (retval); +} + +static void +mevent_populate(struct mevent *mevp, struct kevent *kev) +{ + if (mevp->me_type == EVF_TIMER) { + kev->ident = mevp->me_timid; + kev->data = mevp->me_msecs; + } else { + kev->ident = mevp->me_fd; + kev->data = 0; + } + kev->filter = mevent_kq_filter(mevp); + kev->flags = mevent_kq_flags(mevp); + kev->fflags = mevent_kq_fflags(mevp); + kev->udata = mevp; +} + +static int +mevent_build(struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + mevent_populate(mevp, &kev[i]); + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state & EV_DELETE) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + uint64_t data; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + data = kev[i].data; + + /* XXX check for EV_ERROR ? 
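(If EV_ERROR is set in kev[i].flags, kev[i].data carries the errno for that entry.)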
*/ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param, data); + } +} + +static struct mevent * +mevent_add_state(int tfd, enum ev_type type, mevent_cb_t *func, void *param, + int state, int fflags) +{ + struct kevent kev; + struct mevent *lp, *mevp; + int ret; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + pthread_once(&mevent_once, mevent_init); + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry and populate it. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + mevp->me_state = state; + mevp->me_fflags = fflags; + + /* + * Try to add the event. If this fails, report the failure to + * the caller. + */ + mevent_populate(mevp, &kev); + ret = kevent(mfd, &kev, 1, NULL, 0, NULL); + if (ret == -1) { + free(mevp); + mevp = NULL; + goto exit; + } + + mevp->me_state &= ~EV_ADD; + LIST_INSERT_HEAD(&global_head, mevp, me_list); + +exit: + mevent_qunlock(); + + return (mevp); +} + +struct mevent * +mevent_add(int tfd, enum ev_type type, mevent_cb_t *func, void *param) +{ + + return (mevent_add_state(tfd, type, func, param, EV_ADD, 0)); +} + +struct mevent * +mevent_add_flags(int tfd, enum ev_type type, int fflags, mevent_cb_t *func, void *param) +{ + + return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags)); +} + +struct mevent * +mevent_add_disabled(int tfd, enum ev_type type, mevent_cb_t *func, void *param) +{ + + return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0)); +} + +static int +mevent_update(struct mevent *evp, enum mevent_update_type type, int msecs) +{ + int newstate; + + mevent_qlock(); + + /* + * It's not possible to update a deleted event + */ + assert((evp->me_state & EV_DELETE) == 0); + + newstate = evp->me_state; + if (type == UPDATE_ENABLE) { + newstate |= EV_ENABLE; + newstate &= ~EV_DISABLE; + } else if (type == UPDATE_DISABLE) { + newstate |= EV_DISABLE; + newstate &= ~EV_ENABLE; + } else { + assert(type == UPDATE_TIMER); + assert(evp->me_type == EVF_TIMER); + newstate |= EV_ADD; + evp->me_msecs = msecs; + } + + /* + * No update needed if enable/disable had no effect + */ + if (evp->me_state != newstate || type == UPDATE_TIMER) { + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not + * already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + return (mevent_update(evp, UPDATE_ENABLE, -1)); +} + +int +mevent_disable(struct mevent *evp) +{ + return (mevent_update(evp, UPDATE_DISABLE, -1)); +} + +int +mevent_timer_update(struct mevent *evp, int msecs) +{ + return (mevent_update(evp, UPDATE_TIMER, msecs)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. 
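+ * The i/o thread applies the change and frees the entry on its next pass + * through mevent_build().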
+ */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = EV_DELETE; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ + + pthread_set_name_np(mevent_tid, "mevent"); +} + +void +mevent_dispatch(void) +{ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int numev; + int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mevent_tid = pthread_self(); + mevent_set_name(); + + pthread_once(&mevent_once, mevent_init); + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + } +} diff --git a/tests/sys/virtio/mmio_emul.h b/tests/sys/virtio/mmio_emul.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/mmio_emul.h @@ -0,0 +1,117 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _MMIO_EMUL_H_ +#define _MMIO_EMUL_H_ + +#include +#include +#include +#include +#include + +#include + +#define MI_NAMESZ (40) + +struct mmio_devinst; + +struct mmio_devemu { + const char *me_emu; /* Name of device emulation */ + + /* instance creation */ + int (*me_init)(struct mmio_devinst *, nvlist_t *); + void (*me_write)(struct mmio_devinst *mdi, uint64_t offset, + int size, uint32_t value); +}; +#define MMIO_EMUL_SET(x) DATA_SET(mmio_devemu_set, x) + +enum mmio_devstate { + MIDEV_INVALID, + MIDEV_ACKNOWLEDGED, + MIDEV_DRIVER_FOUND, + MIDEV_FEATURES_OK, + MIDEV_LIVE, + MIDEV_FAILED, + MIDEV_DEVICE_STATES, +}; + +struct mmio_devinst { + struct mmio_devemu *mi_d; + char mi_name[MI_NAMESZ]; + char *mi_addr; /* VQ control region */ + size_t mi_bytes; /* Size of region in bytes */ + int mi_fd; /* File descriptor for the region. */ + enum mmio_devstate mi_state; +}; + +/* XXX Sensible default until proven otherwise. But we need to link it with the in-kernel header. */ +#define MMIO_TOTAL_SIZE (1024 * 1024 * 10) +#define MMIO_CTRDEV ("/dev/vtdbg") + +int init_mmio(nvlist_t *nvl); +void mmio_print_supported_devices(void); +int mmio_parse_device(nvlist_t *nvl, char *opt); + +static __inline void +mmio_set_cfgdata8(struct mmio_devinst *mdi, int offset, uint8_t val) +{ + *(uint8_t *)(mdi->mi_addr + offset) = val; +} + +static __inline void +mmio_set_cfgdata16(struct mmio_devinst *mdi, int offset, uint16_t val) +{ + *(uint16_t *)(mdi->mi_addr + offset) = htole16(val); +} + +static __inline void +mmio_set_cfgdata32(struct mmio_devinst *mdi, int offset, uint32_t val) +{ + *(uint32_t *)(mdi->mi_addr + offset) = htole32(val); +} + +static __inline uint8_t +mmio_get_cfgdata8(struct mmio_devinst *mdi, int offset) +{ + return (*(uint8_t *)(mdi->mi_addr + offset)); +} + +static __inline uint16_t +mmio_get_cfgdata16(struct mmio_devinst *mdi, int offset) +{ + return le16toh((*(uint16_t *)(mdi->mi_addr + offset))); +} + +static __inline uint32_t +mmio_get_cfgdata32(struct mmio_devinst *mdi, int offset) +{ + return le32toh((*(uint32_t *)(mdi->mi_addr + offset))); +} + +#endif /* _MMIO_EMUL_H_ */ diff --git a/tests/sys/virtio/mmio_emul.c b/tests/sys/virtio/mmio_emul.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/mmio_emul.c @@ -0,0 +1,178 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "config.h" +#include "debug.h" +#include "mmio_emul.h" +#include "virtio.h" + +SET_DECLARE(mmio_devemu_set, struct mmio_devemu); + +static struct mmio_devemu * +mmio_emul_finddev(const char *name) +{ + struct mmio_devemu **mdpp, *mdp; + + SET_FOREACH(mdpp, mmio_devemu_set) { + mdp = *mdpp; + if (!strcmp(mdp->me_emu, name)) { + return (mdp); + } + } + + return (NULL); +} + +static void * +mmio_emul_driver_init(void *arg) +{ + int error; + int fd = (int)(long)arg; + + error = ioctl(fd, VIRTIO_DBG_INIT); + if (error < 0) { + EPRINTLN("Control device initialization error: %s", + 
strerror(errno)); + exit(1); + } + pthread_exit(NULL); +} + +static int +mmio_emul_control_init(struct mmio_devinst *mdi, struct mmio_devemu *mde, nvlist_t *nvl) +{ + pthread_t thread; + char *mmio; + int err; + int fd; + + fd = open(MMIO_CTRDEV, O_RDWR); + if (fd == -1) { + EPRINTLN("Control device open error: %s", + strerror(errno)); + return (-1); + } + + mmio = mmap(NULL, MMIO_TOTAL_SIZE, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, fd, 0); + if (mmio == MAP_FAILED) { + EPRINTLN("Control device mapping error: %s", + strerror(errno)); + close(fd); + return (-1); + } + + mdi->mi_fd = fd; + mdi->mi_addr = mmio; + mdi->mi_bytes = MMIO_TOTAL_SIZE; + + /* + * XXX Hack. We currently hardwire the block device ID. Propagate + * the device type in a different way. + */ + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_VIRT); + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VERSION, 0x2); + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_DEVICE_ID, 0x2); + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR); + + err = (mde->me_init)(mdi, nvl); + if (err != 0) + return (err); + + /* + * Make the ioctl out of band, because we wll use this thread to to service + * the register the writes triggered by the driver during device attach. + */ + return (pthread_create(&thread, NULL, mmio_emul_driver_init, (void *)(long)fd)); +} + +static int +mmio_emul_init(struct mmio_devemu *mde, nvlist_t *nvl) +{ + struct mmio_devinst *mdi; + int err; + + mdi = calloc(1, sizeof(struct mmio_devinst)); + if (mdi == NULL) + return (ENOMEM); + + snprintf(mdi->mi_name, sizeof(mdi->mi_name), "%s@mmio", mde->me_emu); + mdi->mi_state = MIDEV_INVALID; + mdi->mi_fd = -1; + + err = mmio_emul_control_init(mdi, mde, nvl); + if (err != 0) { + free(mdi); + return (err); + } + + return (0); +} + +int +mmio_parse_device(nvlist_t *nvl, char *opt) +{ + struct mmio_devemu *mde; + char *emul = opt; + + mde = mmio_emul_finddev(emul); + if (mde == NULL) { + EPRINTLN("unknown mmio device %s\n", emul); + return (EINVAL); + } + + if (get_config_value_node(nvl, "devtype") != NULL) { + EPRINTLN("device type already defined!"); + return (EINVAL); + } + + set_config_value_node(nvl, "devtype", mde->me_emu); + + return (0); +} + + +void +mmio_print_supported_devices(void) +{ + struct mmio_devemu **mdpp, *mdp; + + SET_FOREACH(mdpp, mmio_devemu_set) { + mdp = *mdpp; + printf("%s\n", mdp->me_emu); + } +} + +int +init_mmio(nvlist_t *nvl) +{ + struct mmio_devemu *mde; + const char *emul; + + emul = get_config_value_node(nvl, "devtype"); + if (emul == NULL) { + EPRINTLN("mmio device missing devtype value"); + return (EINVAL); + } + + mde = mmio_emul_finddev(emul); + if (mde == NULL) { + EPRINTLN("mmio unknown device \"%s\"", emul); + return (EINVAL); + } + + return (mmio_emul_init(mde, nvl)); +} diff --git a/tests/sys/virtio/mmio_virtio_block.c b/tests/sys/virtio/mmio_virtio_block.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/mmio_virtio_block.c @@ -0,0 +1,560 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright 2020-2021 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "config.h" +#include "debug.h" +#include "mevent.h" +#include "mmio_emul.h" +#include "virtio.h" +#include "block_if.h" +#include "iov_emul.h" + +#define VTBLK_BSIZE 512 +#define VTBLK_RINGSZ 128 + +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + 1 + +/* Capability bits */ +#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ +#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ +#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ +#define VTBLK_F_RO (1 << 5) /* Disk is read-only */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ +#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ +#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ +#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ +#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ +#define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ +#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ +#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY ) + /* XXX Reactivate */ +// VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * The current blockif_delete() interface only allows a single delete + * request at a time. + */ +#define VTBLK_MAX_DISCARD_SEG 1 + +/* + * An arbitrary limit to prevent excessive latency due to large + * delete requests. 
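+ * With 512-byte sectors, the 16 MiB cap below works out to 32768 sectors + * per request.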
+ */ +#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; + uint32_t vbc_blk_size; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; + uint8_t unused0[1]; + uint16_t num_queues; + uint32_t max_discard_sectors; + uint32_t max_discard_seg; + uint32_t discard_sector_alignment; + uint32_t max_write_zeroes_sectors; + uint32_t max_write_zeroes_seg; + uint8_t write_zeroes_may_unmap; + uint8_t unused1[3]; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_SCSI_CMD 2 +#define VBH_OP_SCSI_CMD_OUT 3 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 +#define VBH_OP_DISCARD 11 +#define VBH_OP_WRITE_ZEROES 13 + +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int mmio_vtblk_debug; +#define DPRINTF(params) if (mmio_vtblk_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct mmio_vtblk_ioreq { + struct blockif_req io_req; + struct mmio_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; + struct iov_emul *io_iove; +}; + +struct virtio_blk_discard_write_zeroes { + uint64_t sector; + uint32_t num_sectors; + struct { + uint32_t unmap:1; + uint32_t reserved:31; + } flags; +}; + +/* + * Per-device softc + */ +struct mmio_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + struct vtblk_config *vbsc_cfg; + struct virtio_consts vbsc_consts; + struct blockif_ctxt *bc; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct mmio_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; +}; + +static void mmio_vtblk_reset(void *); +static void mmio_vtblk_notify(void *, struct vqueue_info *); +static int mmio_vtblk_cfgread(void *, int, int, uint32_t *); +static int mmio_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + .vc_name = "vtblk", + .vc_nvq = 1, + .vc_cfgsize = sizeof(struct vtblk_config), + .vc_reset = mmio_vtblk_reset, + .vc_qnotify = mmio_vtblk_notify, + .vc_cfgread = mmio_vtblk_cfgread, + .vc_cfgwrite = mmio_vtblk_cfgwrite, + .vc_apply_features = NULL, + .vc_hv_caps = VTBLK_S_HOSTCAPS, +}; + +static void +mmio_vtblk_reset(void *vsc) +{ + struct mmio_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !")); + vi_reset_dev(&sc->vbsc_vs); +} + +static void +mmio_vtblk_done_locked(struct mmio_vtblk_ioreq *io, int err) +{ + struct mmio_vtblk_softc *sc = io->io_sc; + int fd = sc->vbsc_vs.vs_mi->mi_fd; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + + iove_export(fd, io->io_iove); + iove_free(io->io_iove); + io->io_iove = NULL; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. 
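+ * That single status byte is the length handed to vq_relchain() below.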
+ */ + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); +} + +static void +mmio_vtblk_done(struct blockif_req *br, int err) +{ + struct mmio_vtblk_ioreq *io = br->br_param; + struct mmio_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + mmio_vtblk_done_locked(io, err); + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +mmio_vtblk_proc(struct mmio_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + struct mmio_vtblk_ioreq *io; + int i, n; + int err; + ssize_t iolen; + int writeop, type; + struct vi_req req; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + struct virtio_blk_discard_write_zeroes *discard; + + n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + + io = &sc->vbsc_ios[req.idx]; + assert(req.readable != 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; + io->io_iove = req.iove; + assert(req.writable != 0); + assert(iov[n].iov_len == 1); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); + /* + * - Write op implies read-only descriptor + * - Read/ident op implies write-only descriptor + * + * By taking away either the read-only fixed header or the write-only + * status iovec, the following condition should hold true. + */ + assert(n == (writeop ? req.readable : req.writable)); + + iolen = 0; + for (i = 1; i < n; i++) { + iolen += iov[i].iov_len; + } + io->io_req.br_resid = iolen; + + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", + writeop ? "write/discard" : "read/ident", iolen, i - 1, + io->io_req.br_offset)); + + switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; + case VBH_OP_WRITE: + err = blockif_write(sc->bc, &io->io_req); + break; + case VBH_OP_DISCARD: + /* + * We currently only support a single request, if the guest + * has submitted a request that doesn't conform to the + * requirements, we return a error. + */ + if (iov[1].iov_len != sizeof (*discard)) { + mmio_vtblk_done_locked(io, EINVAL); + return; + } + + /* The segments to discard are provided rather than data */ + discard = (struct virtio_blk_discard_write_zeroes *) + iov[1].iov_base; + + /* + * virtio v1.1 5.2.6.2: + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP + * for discard and write zeroes commands if any unknown flag is + * set. Furthermore, the device MUST set the status byte to + * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag + * is set. + * + * Currently there are no known flags for a DISCARD request. 
+ */ + if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { + mmio_vtblk_done_locked(io, ENOTSUP); + return; + } + + /* Make sure the request doesn't exceed our size limit */ + if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { + mmio_vtblk_done_locked(io, EINVAL); + return; + } + + io->io_req.br_offset = discard->sector * VTBLK_BSIZE; + io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; + err = blockif_delete(sc->bc, &io->io_req); + break; + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + mmio_vtblk_done_locked(io, 0); + return; + default: + mmio_vtblk_done_locked(io, EOPNOTSUPP); + return; + } + assert(err == 0); +} + +static void +mmio_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct mmio_vtblk_softc *sc = vsc; + + while (vq_has_descs(vq)) + mmio_vtblk_proc(sc, vq); +} + +static void +mmio_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg, + size_t new_size, uint64_t data __unused) +{ + struct mmio_vtblk_softc *sc; + + sc = arg; + + sc->vbsc_cfg->vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */ + /* XXX Handle resizing. */ + printf("UNIMPLEMENTED %s\n", __func__); + exit(1); +} + +static void +mmio_vtblk_event(int fd, enum ev_type type, void *arg, uint64_t offset) +{ + struct mmio_vtblk_softc *sc = (struct mmio_vtblk_softc *)arg; + struct mmio_devinst *mdi = sc->vbsc_vs.vs_mi; + + assert(fd == mdi->mi_fd); + assert(type == EVF_READ); + + vi_mmio_write(&sc->vbsc_vs, offset); + + /* Let in-progress operations continue. */ + ioctl(mdi->mi_fd, VIRTIO_DBG_ACK); +} + +static int +mmio_vtblk_init(struct mmio_devinst *mdi, nvlist_t *nvl) +{ + char bident[MI_NAMESZ]; + struct blockif_ctxt *bctxt; + const char *path, *serial; + MD5_CTX mdctx; + u_char digest[16]; + struct mmio_vtblk_softc *sc; + off_t size; + int i, sectsz, sts, sto; + + /* + * The supplied backing file has to exist + */ + /* Make sure the name fits */ + snprintf(bident, sizeof(bident), "%s", mdi->mi_name); + bctxt = blockif_open(nvl, bident); + if (bctxt == NULL) { + perror("Could not open backing file"); + return (1); + } + + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); + + sc = calloc(1, sizeof(struct mmio_vtblk_softc)); + sc->vbsc_cfg = (struct vtblk_config *)((uint64_t)mdi->mi_addr + VIRTIO_MMIO_CONFIG); + + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct mmio_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = mmio_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } + + bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); + if (blockif_candelete(sc->bc)) + sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, mdi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * If an explicit identifier is not given, create an + * identifier using parts of the md5 sum of the filename. 
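+ * The generated identifier has the form BHYVE-XXXX-XXXX-XXXX, built from + * the first six bytes of the digest.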
+ */ + bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES); + if ((serial = get_config_value_node(nvl, "serial")) != NULL || + (serial = get_config_value_node(nvl, "ser")) != NULL) { + strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES); + } else { + path = get_config_value_node(nvl, "path"); + MD5Init(&mdctx); + MD5Update(&mdctx, path, strlen(path)); + MD5Final(digest, &mdctx); + snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + } + + /* setup virtio block config space */ + sc->vbsc_cfg->vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ + sc->vbsc_cfg->vbc_size_max = 0; /* not negotiated */ + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg->vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg->vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg->vbc_geometry.heads = 0; + sc->vbsc_cfg->vbc_geometry.sectors = 0; + sc->vbsc_cfg->vbc_blk_size = sectsz; + sc->vbsc_cfg->vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg->vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg->vbc_topology.min_io_size = 0; + sc->vbsc_cfg->vbc_topology.opt_io_size = 0; + sc->vbsc_cfg->vbc_writeback = 0; + sc->vbsc_cfg->max_discard_sectors = VTBLK_MAX_DISCARD_SECT; + sc->vbsc_cfg->max_discard_seg = VTBLK_MAX_DISCARD_SEG; + sc->vbsc_cfg->discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; + + mevent_add(mdi->mi_fd, EVF_READ, mmio_vtblk_event, sc); + blockif_register_resize_callback(sc->bc, mmio_vtblk_resized, sc); + + return (0); +} + +static int +mmio_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused, + uint32_t value __unused) +{ + + DPRINTF(("vtblk: write to readonly reg %d", offset)); + return (1); +} + +static int +mmio_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct mmio_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + + +static const struct mmio_devemu mmio_de_vblk = { + .me_emu = "virtio-blk", + .me_init = mmio_vtblk_init, +}; +MMIO_EMUL_SET(mmio_de_vblk); diff --git a/tests/sys/virtio/virtio.h b/tests/sys/virtio/virtio.h new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/virtio.h @@ -0,0 +1,323 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _BHYVE_VIRTIO_H_ +#define _BHYVE_VIRTIO_H_ + +#include + +#include +#include +#include + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If is sufficiently large, the above will occupy more than + * two pages. In any case, all pages must be physically contiguous + * within the guest's physical address space. + * + * The 16-byte "desc" descriptors consist of a 64-bit guest + * physical address , a 32-bit length , a 16-bit + * , and a 16-bit field (all in guest byte order). + * + * There are three flags that may be set : + * NEXT descriptor is chained, so use its "next" field + * WRITE descriptor is for host to write into guest RAM + * (else host is to read from guest RAM) + * INDIRECT descriptor address field is (guest physical) + * address of a linear array of descriptors + * + * Unless INDIRECT is set, is the number of bytes that may + * be read/written from guest physical address . If + * INDIRECT is set, WRITE is ignored and provides the length + * of the indirect descriptors (and must be a multiple of + * 16). Note that NEXT may still be set in the main descriptor + * pointing to the indirect, and should be set in each indirect + * descriptor that uses the next descriptor (these should generally + * be numbered sequentially). However, INDIRECT must not be set + * in the indirect descriptors. Upon reaching an indirect descriptor + * without a NEXT bit, control returns to the direct descriptors. + * + * Except inside an indirect, each value must be in the + * range [0 .. N) (i.e., the half-open interval). (Inside an + * indirect, each must be in the range [0 .. /16).) + * + * The "avail" data structures reside in the same pages as the + * "desc" structures since both together are used by the device to + * pass information to the hypervisor's virtual driver. 
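+ * (As a sizing example for a queue of size N = 128 with the 4096-byte
+ * alignment used here: "desc" takes 128 * 16 = 2048 bytes, "avail"
+ * takes 6 + 2 * 128 = 262 bytes, so the first region pads out to one
+ * 4096-byte page; "used" takes 6 + 8 * 128 = 1030 bytes and pads to a
+ * second page, for 8192 bytes total.)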
These + * begin with a 16-bit field and 16-bit index , then + * have 16-bit values, followed by one final 16-bit + * field . The entries are simply indices + * into the descriptor ring (and thus must meet the same + * constraints as each value). However, is counted + * up from 0 (initially) and simply wraps around after 65535; it + * is taken mod to find the next available entry. + * + * The "used" ring occupies a separate page or pages, and contains + * values written from the virtual driver back to the guest OS. + * This begins with a 16-bit and 16-bit , then there + * are "vring_used" elements, followed by a 16-bit . + * The "vring_used" elements consist of a 32-bit and a + * 32-bit (vu_tlen below). The is simply the index of + * the head of a descriptor chain the guest made available + * earlier, and the is the number of bytes actually written, + * e.g., in the case of a network driver that provided a large + * receive buffer but received only a small amount of data. + * + * The two event fields, and , in the + * avail and used rings (respectively -- note the reversal!), are + * always provided, but are used only if the virtual device + * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature + * negotiation. Similarly, both rings provide a flag -- + * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in + * their field, indicating that the guest does not need an + * interrupt, or that the hypervisor driver does not need a + * notify, when descriptors are added to the corresponding ring. + * (These are provided only for interrupt optimization and need + * not be implemented.) + */ +#define VRING_ALIGN 4096 + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_SCSI 0x1004 +#define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_9P 0x1009 +#define VIRTIO_DEV_INPUT 0x1052 + +/* + * PCI revision IDs + */ +#define VIRTIO_REV_INPUT 1 + +/* + * PCI subvendor IDs + */ +#define VIRTIO_SUBVEN_INPUT 0x108E + +/* + * PCI subdevice IDs + */ +#define VIRTIO_SUBDEV_INPUT 0x1100 + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification */ +static inline int +vring_size_aligned(u_int qsz) +{ + return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN)); +} + +struct mmio_devinst; +struct vqueue_info; + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct mmio_devinst *vs_mi; /* MMIO device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + void (*vc_apply_features)(void 
*, uint64_t); + /* called to apply negotiated features */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? */ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->idx */ + uint16_t vq_next_used; /* index of the next used slot to be filled */ + uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */ + + uint32_t vq_offset; /* Offset in the control region */ + + struct vring_desc *vq_desc; /* descriptor array */ + struct vring_avail *vq_avail; /* the "avail" ring */ + struct vring_used *vq_used; /* the "used" ring */ +}; + +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->idx); +} + + +static inline void +vq_kick_enable(struct vqueue_info *vq) +{ + + vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY; + /* + * Full memory barrier to make sure the store to vq_used->flags + * happens before the load from vq_avail->idx, which results from a + * subsequent call to vq_has_descs(). + */ + atomic_thread_fence_seq_cst(); +} + +static inline void +vq_kick_disable(struct vqueue_info *vq) +{ + + vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY; +} + +struct iovec; + +/* + * Request description returned by vq_getchain. + * + * Writable iovecs start at iov[req.readable]. 
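+ *
+ * For instance, a block WRITE request with a header descriptor, two
+ * data descriptors, and a status descriptor comes back from
+ * vq_getchain() with readable = 3, writable = 1, and the status
+ * buffer at iov[3].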
+ */ +struct vi_req { + int readable; /* num of readable iovecs */ + int writable; /* num of writable iovecs */ + unsigned int idx; /* ring index */ + struct iov_emul *iove; /* Export io vector */ +}; + +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *mi, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); + +int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, + struct vi_req *reqp); +void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); +void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, + uint32_t iolen); +void vq_relchain_publish(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +void vi_mmio_write(struct virtio_softc *vs, uint64_t offset); +#endif /* _BHYVE_VIRTIO_H_ */ diff --git a/tests/sys/virtio/virtio.c b/tests/sys/virtio/virtio.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/virtio.c @@ -0,0 +1,886 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "debug.h" +#include "iov_emul.h" +#include "mmio_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. 
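+ *
+ * Callers are expected to embed the virtio_softc as the first member
+ * of their device softc (e.g. mmio_vtblk_softc passes &sc->vbsc_vs
+ * together with sc), which is what the assert() below checks.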
+ */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *mdi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_mi = mdi; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Deliver an interrupt to the guest device. + */ +static void +vq_interrupt(struct virtio_softc *vs) +{ + int fd = vs->vs_mi->mi_fd; + int error; + + mmio_set_cfgdata32(vs->vs_mi, VIRTIO_MMIO_INTERRUPT_STATUS, VIRTIO_MMIO_INT_VRING); + error = ioctl(fd, VIRTIO_DBG_KICK); + if (error != 0) + EPRINTLN("device kick failed with %d\n", error); + +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct mmio_devinst *mdi = vs->vs_mi; + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_next_used = 0; + vq->vq_save_used = 0; + /* XXX Is this right? How should we actually set it? */ + vq->vq_offset = UINT_MAX; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + + mdi->mi_state = MIDEV_INVALID; + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_STATUS, 0); + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0); + +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +/* XXX Switch it back to using the virtio softc. */ +static void +vi_vq_init(struct mmio_devinst *mdi, struct vqueue_info *vq) +{ + uint64_t offset; + + offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_HIGH); + offset <<= 32; + offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_DESC_LOW); + vq->vq_desc = (struct vring_desc *)(mdi->mi_addr + offset); + + offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_HIGH); + offset <<= 32; + offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_AVAIL_LOW); + vq->vq_avail = (struct vring_avail *)(mdi->mi_addr + offset); + + offset = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_HIGH); + offset <<= 32; + offset |= mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_USED_LOW); + vq->vq_used = (struct vring_used *)(mdi->mi_addr + offset); + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_next_used = 0; + vq->vq_save_used = 0; +} + + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, struct vring_desc *vd, struct iovec *iov, + int n_iov, struct vi_req *reqp, struct iov_emul *wiove, + struct iov_emul *riove) +{ + if (i >= n_iov) + return; + + /* XXX Handle OOM scenarios leading to iove_add failures. 
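+	 * As written, a failed iove_add() silently drops the descriptor:
+	 * _vq_record() returns without counting it in readable/writable,
+	 * while vq_getchain() still advances its iov index.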
*/ + + /* Preallocate a descriptor data region for the descriptor */ + if ((vd->flags & VRING_DESC_F_WRITE) == 0) { + if (iove_add(riove, vd->addr, vd->len, &iov[i]) != 0) + return; + + reqp->readable++; + } else { + if (iove_add(wiove, vd->addr, vd->len, &iov[i]) != 0) + return; + + reqp->writable++; + } +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +static int +vq_import_indirect(struct vring_desc __unused **vdp) +{ + /* XXX Use the provided vd address to read in the indirect descriptor */ + printf("UNIMPLEMENTED %s\n", __func__); + exit(1); +} + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the "flags" and "next" field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. + * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, + struct vi_req *reqp) +{ + int i; + u_int ndesc, n_indir; + u_int idx, next; + struct vi_req req; + struct vring_desc *vdir, *vindir, *vp; + struct virtio_softc *vs; + const char *name; + int error; + struct iov_emul *riove, *wiove; + int fd; + + vs = vq->vq_vs; + fd = vs->vs_mi->mi_fd; + name = vs->vs_vc->vc_name; + memset(&req, 0, sizeof(req)); + + vindir = NULL; + riove = iove_alloc(); + wiove = iove_alloc(); + if (riove == NULL || wiove == NULL) { + iove_free(riove); + iove_free(wiove); + return (ENOMEM); + } + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->idx until all of the descriptors + * the guest has written are valid (including all their + * "next" fields and "flags"). + * + * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. 
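+	 *
+	 * For example, if the avail index has wrapped around to 3 while
+	 * vq_last_avail is still 65534, (3 - 65534) mod 2^16 = 5, so
+	 * five descriptors are pending.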
+ */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + EPRINTLN( + "%s: ndesc (%u) out of range, driver confused?", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; + req.iove = wiove; + vq->vq_last_avail++; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) { + if (next >= vq->vq_qsize) { + EPRINTLN( + "%s: descriptor index %u out of range, " + "driver confused?", + name, next); + goto error; + } + vdir = &vq->vq_desc[next]; + if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, iov, niov, &req, wiove, riove); + i++; + } else if ((vs->vs_vc->vc_hv_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + EPRINTLN( + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?", + name); + goto error; + } else { + n_indir = vdir->len / 16; + if ((vdir->len & 0xf) || n_indir == 0) { + EPRINTLN( + "%s: invalid indir len 0x%x, " + "driver confused?", + name, (u_int)vdir->len); + goto error; + } + + error = vq_import_indirect(&vindir); + if (error != 0) + goto error; + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->flags & VRING_DESC_F_INDIRECT) { + EPRINTLN( + "%s: indirect desc has INDIR flag," + " driver confused?", + name); + goto error; + } + _vq_record(i, vp, iov, niov, &req, wiove, riove); + if (++i > VQ_MAX_DESCRIPTORS) { + EPRINTLN( + "%s: descriptor loop? count > %d - driver confused?", + name, i); + goto error; + } + if ((vp->flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->next; + if (next >= n_indir) { + EPRINTLN( + "%s: invalid next %u > %u, " + "driver confused?", + name, (u_int)next, n_indir); + goto error; + } + } + } + if ((vdir->flags & VRING_DESC_F_NEXT) == 0) + goto done; + } + +error: + iove_free(riove); + iove_free(wiove); + free(vindir); + + return (-1); + +done: + /* Read in readable descriptors from the kernel. */ + error = iove_import(fd, riove); + iove_free(riove); + free(vindir); + + if (error != 0) { + EPRINTLN("Reading in data failed with %d", error); + return (-1); + } + + *reqp = req; + return (i); +} + +/* + * Return the first n_chain request chains back to the available queue. + * + * (These chains are the ones you handled when you called vq_getchain() + * and used its positive return value.) 
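+ *
+ * A typical use is backpressure: if a chain cannot be serviced right
+ * now, vq_retchains(vq, 1) rewinds vq_last_avail so the same chain is
+ * picked up again on the next pass over the queue.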
+ */ +void +vq_retchains(struct vqueue_info *vq, uint16_t n_chains) +{ + + vq->vq_last_avail -= n_chains; +} + +void +vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + struct vring_used *vuh; + struct vring_used_elem *vue; + uint16_t mask; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + + vue = &vuh->ring[vq->vq_next_used++ & mask]; + vue->id = idx; + vue->len = iolen; +} + +void +vq_relchain_publish(struct vqueue_info *vq) +{ + /* + * Ensure the used descriptor is visible before updating the index. + * This is necessary on ISAs with memory ordering less strict than x86 + * (and even on x86 to act as a compiler barrier). + */ + atomic_thread_fence_rel(); + vq->vq_used->idx = vq->vq_next_used; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + vq_relchain_prepare(vq, idx, iolen); + vq_relchain_publish(vq); +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->idx; + + /* + * Use full memory barrier between "idx" store from preceding + * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or + * "flags" field below. + */ + atomic_thread_fence_seq_cst(); + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). 
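+		 *
+		 * Worked example: with old_idx = 10 and new_idx = 12, a
+		 * guest event_idx of 10 gives (12 - 10 - 1) = 1 < (12 - 10)
+		 * = 2, so we interrupt; an event_idx of 12 gives 65535
+		 * (mod 2^16), which is not < 2, so the interrupt is
+		 * suppressed.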
+ */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs); +} + +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VIRTIO_MMIO_MAGIC_VALUE, 1,"MMIO_MAGIC_VALUE" }, + { VIRTIO_MMIO_VERSION, 1, "VERSION" }, + { VIRTIO_MMIO_DEVICE_ID, 1, "DEVICE_ID" }, + { VIRTIO_MMIO_VENDOR_ID, 1, "VENDOR_ID" }, + { VIRTIO_MMIO_HOST_FEATURES, 1, "HOST_FEATURES" }, + { VIRTIO_MMIO_HOST_FEATURES_SEL, 0, "HOST_FEATURES_SEL" }, + { VIRTIO_MMIO_GUEST_FEATURES, 0, "GUEST_FEATURES" }, + { VIRTIO_MMIO_GUEST_FEATURES_SEL, 0, "GUEST_FEATURES_SEL" }, + { VIRTIO_MMIO_QUEUE_SEL, 0, "QUEUE_SEL" }, + { VIRTIO_MMIO_QUEUE_NUM_MAX, 1, "QUEUE_NUM_MAX" }, + { VIRTIO_MMIO_QUEUE_NUM, 0, "QUEUE_NUM" }, + { VIRTIO_MMIO_QUEUE_READY, 0, "QUEUE_READY" }, + { VIRTIO_MMIO_QUEUE_NOTIFY, 0, "QUEUE_NOTIFY" }, + { VIRTIO_MMIO_INTERRUPT_STATUS, 1, "INTERRUPT_STATUS" }, + { VIRTIO_MMIO_INTERRUPT_ACK, 0, "INTERRUPT_ACK" }, + { VIRTIO_MMIO_STATUS, 0, "STATUS" }, + { VIRTIO_MMIO_QUEUE_DESC_LOW, 0, "QUEUE_DESC_LOW" }, + { VIRTIO_MMIO_QUEUE_DESC_HIGH, 0, "QUEUE_DESC_HIGH" }, + { VIRTIO_MMIO_QUEUE_AVAIL_LOW, 0, "QUEUE_AVAIL_LOW" }, + { VIRTIO_MMIO_QUEUE_AVAIL_HIGH, 0, "QUEUE_AVAIL_HIGH" }, + { VIRTIO_MMIO_QUEUE_USED_LOW, 0, "QUEUE_USED_LOW" }, + { VIRTIO_MMIO_QUEUE_USED_HIGH, 0, "QUEUE_USED_HIGH" }, + { VIRTIO_MMIO_CONFIG_GENERATION, 1, "CONFIG_GENERATION" }, +}; + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if (cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +static void +vi_handle_state_change(struct mmio_devinst *mdi, uint32_t status) +{ + switch (mdi->mi_state) { + case MIDEV_INVALID: + if (status & VIRTIO_CONFIG_STATUS_ACK) + mdi->mi_state = MIDEV_ACKNOWLEDGED; + break; + + case MIDEV_ACKNOWLEDGED: + if (status & VIRTIO_CONFIG_STATUS_DRIVER) + mdi->mi_state = MIDEV_DRIVER_FOUND; + break; + + case MIDEV_DRIVER_FOUND: + if (status & VIRTIO_CONFIG_S_FEATURES_OK) + mdi->mi_state = MIDEV_FEATURES_OK; + break; + + case MIDEV_FEATURES_OK: + if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK) + mdi->mi_state = MIDEV_LIVE; + + break; + + case MIDEV_LIVE: + break; + + case MIDEV_FAILED: + mdi->mi_state = MIDEV_FAILED; + break; + + default: + EPRINTLN("invalid device state %d", mdi->mi_state); + exit(1); + } +} + +static void +vi_handle_status(struct virtio_softc *vs, uint32_t status) +{ + + struct mmio_devinst *mdi = vs->vs_mi; + + if (status & VIRTIO_CONFIG_STATUS_FAILED) { + mdi->mi_state = MIDEV_FAILED; + return; + } + + if (status & VIRTIO_CONFIG_STATUS_RESET) { + mdi->mi_state = MIDEV_INVALID; + vi_reset_dev(vs); + return; + } + + vi_handle_state_change(mdi, status); +} + +static void +vi_handle_host_features_sel(struct virtio_softc *vs, uint32_t sel) +{ + uint64_t caps = vs->vs_vc->vc_hv_caps; + struct mmio_devinst *mdi = vs->vs_mi; + + if (sel > 1) { + EPRINTLN("HOST_FEATURES SEL 0x%x, " + "driver confused?", sel); + return; + } + + if (sel == 1) { + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES, + 
(uint32_t)(caps >> 32)); + } else { + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_HOST_FEATURES, + (uint32_t)caps); + } +} + +static void +vi_handle_guest_features(struct virtio_softc *vs, uint32_t features) +{ + struct mmio_devinst *mdi = vs->vs_mi; + struct virtio_consts *vc = vs->vs_vc; + uint64_t caps; + int hi; + + /* + * XXX Add asserts to ensure we are negotiating w/ the device + * and not in the middle of an operation. + */ + + hi = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_GUEST_FEATURES_SEL); + if (hi > 1) { + EPRINTLN("GUEST_FEATURES_SEL 0x%x, " + "driver confused?", hi); + return; + } + + if (hi == 1) { + /* Update the upper bits, keep the lower ones intact. */ + caps = (vc->vc_hv_caps | features) >> 32; + vs->vs_negotiated_caps &= (vs->vs_negotiated_caps & (((1UL << 32) - 1)) << 32); + vs->vs_negotiated_caps |= (caps << 32); + } else { + /* Update the lower bits, keep the upper ones intact. */ + caps = (uint32_t)(vc->vc_hv_caps | features); + vs->vs_negotiated_caps &= (vs->vs_negotiated_caps & ((1UL << 32) - 1)); + vs->vs_negotiated_caps |= caps; + + /* The LSBs get sent second, we are ready to apply the features. */ + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); + } + +} + + +static void +vi_handle_queue_sel(struct virtio_softc *vs) +{ + struct mmio_devinst *mdi = vs->vs_mi; + struct vqueue_info *vq; + + vs->vs_curq = mmio_get_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_SEL); + + if (vs->vs_curq < 0 || vs->vs_curq >= vs->vs_vc->vc_nvq) { + EPRINTLN("Selected queue %d, driver confused?", vs->vs_curq); + return; + } + + vq = &vs->vs_queues[vs->vs_curq]; + if (vq_ring_ready(vq)) { + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 1); + return; + } + + /* Part of virtqueue initialization. */ + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_NUM_MAX, vq->vq_qsize); + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_QUEUE_READY, 0); + + return; +} + +static void +vi_handle_queue_num(struct virtio_softc *vs, int32_t qsize) +{ + struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq]; + + if (qsize > vq->vq_qsize || !powerof2(qsize)) { + EPRINTLN("QUEUE_NUM %d is invalid, driver confused?", qsize); + return; + } + + vq->vq_qsize = qsize; +} + +static void +vi_handle_queue_ready(struct virtio_softc *vs, uint32_t ready) +{ + struct vqueue_info *vq = &vs->vs_queues[vs->vs_curq]; + struct mmio_devinst *mdi = vs->vs_mi; + + if (ready > 1) { + EPRINTLN("QUEUE_READY has value %d, driver confused?", ready); + return; + } + + if (ready == 1 && !vq_ring_ready(vq)) { + vi_vq_init(mdi, vq); + return; + } +} + +static void +vi_handle_interrupt_ack(struct virtio_softc *vs, uint32_t ack) +{ + struct mmio_devinst *mdi = vs->vs_mi; + + /* + * Follow the protocol even if we are executing the + * interrupt ourselves, so we are the ones that sent + * the ACK from the kernel in the first place. 
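+	 * A conforming driver writes back the bits it read from
+	 * INTERRUPT_STATUS, which vq_interrupt() above always sets to
+	 * VIRTIO_MMIO_INT_VRING (1), hence the check for 1 below.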
+ */ + if (ack != 1) { + EPRINTLN("INTERRUPT_ACK has value %d, " + "driver confused?", ack); + return; + } + + mmio_set_cfgdata32(mdi, VIRTIO_MMIO_INTERRUPT_ACK, 0); +} + +static void +vi_handle_queue_notify(struct virtio_softc *vs, uint32_t ind) +{ + struct virtio_consts *vc = vs->vs_vc; + struct vqueue_info *vq; + + if (ind >= (unsigned int)vc->vc_nvq) { + EPRINTLN("%s: queue %d notify out of range", + vc->vc_name, ind); + } + + vq = &vs->vs_queues[ind]; + if (vq->vq_notify) { + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + } else if (vc->vc_qnotify) { + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + } else { + EPRINTLN("%s: qnotify value %d: missing vq/vc notify", + vc->vc_name, ind); + } + +} + +void +vi_mmio_write(struct virtio_softc *vs, uint64_t offset) +{ + /* Reported writes are always 32-bit. */ + const int size = 4; + + struct mmio_devinst *mdi = vs->vs_mi; + struct virtio_consts *vc; + struct config_reg *cr; + const char *name; + uint32_t newoff; + int32_t value; + uint64_t max; + int error; + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + /* If writing in the config space, */ + if (offset >= VIRTIO_MMIO_CONFIG) { + newoff = offset - VIRTIO_MMIO_CONFIG; + max = vc->vc_cfgsize ? vc->vc_cfgsize : (mdi->mi_bytes - VIRTIO_MMIO_CONFIG); + if (newoff + size > max) + goto bad; + + value = mmio_get_cfgdata32(mdi, offset); + + if (vc->vc_cfgwrite != NULL) + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + else + error = 0; + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL) { + EPRINTLN("%s: write to bad offset %jd", + name, (uintmax_t)offset); + goto done; + + } + + if (cr->cr_ro) { + EPRINTLN("%s: write to read-only reg %s", + name, cr->cr_name); + goto done; + } + + value = mmio_get_cfgdata32(mdi, cr->cr_offset); + + switch (cr->cr_offset) { + case VIRTIO_MMIO_STATUS: + vi_handle_status(vs, value); + break; + + case VIRTIO_MMIO_HOST_FEATURES_SEL: + vi_handle_host_features_sel(vs, value); + break; + + case VIRTIO_MMIO_GUEST_FEATURES: + vi_handle_guest_features(vs, value); + break; + + case VIRTIO_MMIO_QUEUE_SEL: + vi_handle_queue_sel(vs); + break; + + case VIRTIO_MMIO_QUEUE_NUM: + vi_handle_queue_num(vs, value); + break; + + case VIRTIO_MMIO_QUEUE_READY: + vi_handle_queue_ready(vs, value); + break; + + case VIRTIO_MMIO_QUEUE_NOTIFY: + vi_handle_queue_notify(vs, value); + break; + + case VIRTIO_MMIO_INTERRUPT_ACK: + vi_handle_interrupt_ack(vs, value); + break; + default: + EPRINTLN("Unhandled offset %d\n", cr->cr_offset); + assert(0); + } + + goto done; + +done: + + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/tests/sys/virtio/virtiodbg.c b/tests/sys/virtio/virtiodbg.c new file mode 100644 --- /dev/null +++ b/tests/sys/virtio/virtiodbg.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "debug.h" +#include "mevent.h" +#include "mmio_emul.h" + +static void +virtiodbg_usage(int code) +{ + const char *progname; + + progname = getprogname(); + + fprintf(stderr, + "Usage: %s [-hot]\n" + " -h: help\n" + " -o: set config 'var' to 'value'\n" + " -t: MMIO device type\n", + progname); + exit(code); +} + +static bool +virtiodbg_parse_config_option(nvlist_t *nvl, const char *option) +{ + const char *key; + char *value; + + key = option; + value = strchr(option, '='); + if (value == NULL || value[1] == '\0') + return (false); + + *value = '\0'; + + set_config_value_node(nvl, key, value + 1); + return (true); +} 
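+
+/*
+ * Example (hypothetical invocation; the exact -t syntax is defined by
+ * mmio_parse_device()):
+ *
+ *	virtiodbg -t virtio-blk -o path=/tmp/disk.img -o serial=TEST0001
+ *
+ * Each -o argument is split on its first '=' by
+ * virtiodbg_parse_config_option() above and stored in the "device"
+ * config node created in virtiodbg_optparse() below.
+ */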
+ + +static nvlist_t * +virtiodbg_optparse(int argc, char **argv) +{ + const char *optstr; + nvlist_t *nvl; + int c; + + nvl = create_config_node("device"); + + optstr = "ho:t:"; + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 't': + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + mmio_print_supported_devices(); + exit(0); + } else if (mmio_parse_device(nvl, optarg) != 0) + exit(4); + else + break; + case 'o': + if (!virtiodbg_parse_config_option(nvl, optarg)) { + errx(EX_USAGE, + "invalid configuration option '%s'", + optarg); + } + break; + case 'h': + virtiodbg_usage(0); + default: + virtiodbg_usage(1); + } + } + + return (nvl); +} + +int +main(int argc, char *argv[]) +{ + nvlist_t *nvl; + + init_config(); + nvl = virtiodbg_optparse(argc, argv); + + /* Exit if a device emulation finds an error in its initialization */ + if (init_mmio(nvl) != 0) { + EPRINTLN("Device emulation initialization error: %s", + strerror(errno)); + exit(4); + } + + /* Head off to the main event dispatch loop. */ + mevent_dispatch(); + + exit(4); +}