diff --git a/usr.sbin/bhyve/pci_virtio_9p.c b/usr.sbin/bhyve/pci_virtio_9p.c index e27159eb22cb..fed18ce5a8cc 100644 --- a/usr.sbin/bhyve/pci_virtio_9p.c +++ b/usr.sbin/bhyve/pci_virtio_9p.c @@ -1,360 +1,352 @@ /*- * Copyright (c) 2015 iXsystems Inc. * Copyright (c) 2017-2018 Jakub Klama * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * VirtIO filesystem passthrough using 9p protocol. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VT9P_MAX_IOV 128 #define VT9P_RINGSZ 256 #define VT9P_MAXTAGSZ 256 #define VT9P_CONFIGSPACESZ (VT9P_MAXTAGSZ + sizeof(uint16_t)) static int pci_vt9p_debug; #define DPRINTF(params) if (pci_vt9p_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vt9p_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_vq; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootpath; struct pci_vt9p_config * vsc_config; struct l9p_backend * vsc_fs_backend; struct l9p_server * vsc_server; struct l9p_connection * vsc_conn; }; struct pci_vt9p_request { struct pci_vt9p_softc * vsr_sc; struct iovec * vsr_iov; size_t vsr_niov; size_t vsr_respidx; size_t vsr_iolen; uint16_t vsr_idx; }; struct pci_vt9p_config { uint16_t tag_len; char tag[0]; } __attribute__((packed)); static int pci_vt9p_send(struct l9p_request *, const struct iovec *, const size_t, const size_t, void *); static void pci_vt9p_drop(struct l9p_request *, const struct iovec *, size_t, void *); static void pci_vt9p_reset(void *); static void pci_vt9p_notify(void *, struct vqueue_info *); static int pci_vt9p_cfgread(void *, int, int, uint32_t *); static void pci_vt9p_neg_features(void *, uint64_t); static struct virtio_consts vt9p_vi_consts = { "vt9p", /* our name */ 1, /* we support 1 virtqueue */ VT9P_CONFIGSPACESZ, /* config reg size */ pci_vt9p_reset, /* reset */ pci_vt9p_notify, /* device-wide qnotify */ pci_vt9p_cfgread, /* read virtio config */ NULL, /* write virtio config */ pci_vt9p_neg_features, /* apply negotiated features */ (1 << 0), /* our capabilities */ }; static void pci_vt9p_reset(void *vsc) { struct pci_vt9p_softc *sc; sc = vsc; DPRINTF(("vt9p: device reset requested !\n")); vi_reset_dev(&sc->vsc_vs); } static void pci_vt9p_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vt9p_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vt9p_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vt9p_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vt9p_get_buffer(struct l9p_request *req, struct iovec *iov, size_t *niov, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; size_t n = preq->vsr_niov - preq->vsr_respidx; memcpy(iov, preq->vsr_iov + preq->vsr_respidx, n * sizeof(struct iovec)); *niov = n; return (0); } static int pci_vt9p_send(struct l9p_request *req, const struct iovec *iov, const size_t niov, const size_t iolen, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; preq->vsr_iolen = iolen; pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vsc_vq, preq->vsr_idx, preq->vsr_iolen); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); return (0); } static void pci_vt9p_drop(struct l9p_request *req, const struct iovec *iov, size_t niov, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vsc_vq, preq->vsr_idx, 0); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); } static void pci_vt9p_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov[VT9P_MAX_IOV]; struct pci_vt9p_softc *sc; struct pci_vt9p_request *preq; - uint16_t idx, n, i; - uint16_t flags[VT9P_MAX_IOV]; + struct vi_req req; + uint16_t n; sc = vsc; while (vq_has_descs(vq)) { - n = vq_getchain(vq, &idx, iov, VT9P_MAX_IOV, flags); + n = vq_getchain(vq, iov, VT9P_MAX_IOV, &req); preq = calloc(1, sizeof(struct pci_vt9p_request)); preq->vsr_sc = sc; - preq->vsr_idx = idx; + preq->vsr_idx = req.idx; preq->vsr_iov = iov; preq->vsr_niov = n; - preq->vsr_respidx = 0; - - /* Count readable descriptors */ - for (i = 0; i < n; i++) { - if (flags[i] & VRING_DESC_F_WRITE) - break; - - preq->vsr_respidx++; - } + preq->vsr_respidx = req.readable; for (int i = 0; i < n; i++) { DPRINTF(("vt9p: vt9p_notify(): desc%d base=%p, " - "len=%zu, flags=0x%04x\r\n", i, iov[i].iov_base, - iov[i].iov_len, flags[i])); + "len=%zu\r\n", i, iov[i].iov_base, + iov[i].iov_len)); } l9p_connection_recv(sc->vsc_conn, iov, preq->vsr_respidx, preq); } } static int pci_vt9p_legacy_config(nvlist_t *nvl, const char *opts) { char *sharename, *tofree, *token, *tokens; if (opts == NULL) return (0); tokens = tofree = strdup(opts); while ((token = strsep(&tokens, ",")) != NULL) { if (strchr(token, '=') != NULL) { if (sharename != NULL) { EPRINTLN( "virtio-9p: more than one share name given"); return (-1); } sharename = strsep(&token, "="); set_config_value_node(nvl, "sharename", sharename); set_config_value_node(nvl, "path", token); } else set_config_bool_node(nvl, token, true); } free(tofree); return (0); } static int pci_vt9p_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vt9p_softc *sc; const char *value; const char *sharename; int rootfd; bool ro; cap_rights_t rootcap; ro = get_config_bool_node_default(nvl, "ro", false); value = get_config_value_node(nvl, "path"); if (value == NULL) { EPRINTLN("virtio-9p: path required"); return (1); } rootfd = open(value, O_DIRECTORY); if (rootfd < 0) { EPRINTLN("virtio-9p: failed to open '%s': %s", value, strerror(errno)); return (-1); } sharename = get_config_value_node(nvl, "sharename"); if (sharename == NULL) { EPRINTLN("virtio-9p: share name required"); return (1); } if (strlen(sharename) > VT9P_MAXTAGSZ) { EPRINTLN("virtio-9p: share name too long"); return (1); } sc = calloc(1, sizeof(struct pci_vt9p_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vt9p_config) + VT9P_MAXTAGSZ); pthread_mutex_init(&sc->vsc_mtx, NULL); cap_rights_init(&rootcap, CAP_LOOKUP, CAP_ACL_CHECK, CAP_ACL_DELETE, CAP_ACL_GET, CAP_ACL_SET, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSTAT, CAP_CREATE, CAP_FCHMODAT, CAP_FCHOWNAT, CAP_FTRUNCATE, CAP_LINKAT_SOURCE, CAP_LINKAT_TARGET, CAP_MKDIRAT, CAP_MKNODAT, CAP_PREAD, CAP_PWRITE, CAP_RENAMEAT_SOURCE, CAP_RENAMEAT_TARGET, CAP_SEEK, CAP_SYMLINKAT, CAP_UNLINKAT, CAP_EXTATTR_DELETE, CAP_EXTATTR_GET, CAP_EXTATTR_LIST, CAP_EXTATTR_SET, CAP_FUTIMES, CAP_FSTATFS, CAP_FSYNC, CAP_FPATHCONF); if (cap_rights_limit(rootfd, &rootcap) != 0) return (1); sc->vsc_config->tag_len = (uint16_t)strlen(sharename); memcpy(sc->vsc_config->tag, sharename, sc->vsc_config->tag_len); if (l9p_backend_fs_init(&sc->vsc_fs_backend, rootfd, ro) != 0) { errno = ENXIO; return (1); } if (l9p_server_init(&sc->vsc_server, sc->vsc_fs_backend) != 0) { errno = ENXIO; return (1); } if (l9p_connection_init(sc->vsc_server, &sc->vsc_conn) != 0) { errno = EIO; return (1); } sc->vsc_conn->lc_msize = L9P_MAX_IOV * PAGE_SIZE; sc->vsc_conn->lc_lt.lt_get_response_buffer = pci_vt9p_get_buffer; sc->vsc_conn->lc_lt.lt_send_response = pci_vt9p_send; sc->vsc_conn->lc_lt.lt_drop_response = pci_vt9p_drop; vi_softc_linkup(&sc->vsc_vs, &vt9p_vi_consts, sc, pi, &sc->vsc_vq); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_vq.vq_qsize = VT9P_RINGSZ; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_9P); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_9P); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); return (0); } struct pci_devemu pci_de_v9p = { .pe_emu = "virtio-9p", .pe_legacy_config = pci_vt9p_legacy_config, .pe_init = pci_vt9p_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_v9p); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 0dc58e49594b..8a172c54eda7 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,576 +1,577 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. */ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *); static void pci_vtblk_resume(void *); static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ #ifdef BHYVE_SNAPSHOT pci_vtblk_pause, /* pause blockif threads */ pci_vtblk_resume, /* resume blockif threads */ pci_vtblk_snapshot, /* save / restore device state */ #endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. */ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device pause requested !\n")); blockif_pause(sc->bc); } static void pci_vtblk_resume(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device resume requested !\n")); blockif_resume(sc->bc); } static int pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtblk_softc *sc = vsc; SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), meta, ret, done); done: return (ret); } #endif static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; + struct vi_req req; struct iovec iov[BLOCKIF_IOV_MAX + 2]; - uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; - n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); + n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); - io = &sc->vbsc_ios[idx]; - assert((flags[0] & VRING_DESC_F_WRITE) == 0); + io = &sc->vbsc_ios[req.idx]; + assert(req.readable != 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; + assert(req.writable != 0); assert(iov[n].iov_len == 1); - assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); + /* + * - Write op implies read-only descriptor + * - Read/ident op implies write-only descriptor + * + * By taking away either the read-only fixed header or the write-only + * status iovec, the following condition should hold true. + */ + assert(n == (writeop ? req.readable : req.writable)); iolen = 0; for (i = 1; i < n; i++) { - /* - * - write op implies read-only descriptor, - * - read/ident op implies write-only descriptor, - * therefore test the inverse of the descriptor bit - * to the op. - */ - assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request, if the guest * has submitted a request that doesn't conform to the * requirements, we return a error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set. Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; const char *path; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(nvl, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ path = get_config_value_node(nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. */ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_legacy_config = blockif_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, #endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_console.c b/usr.sbin/bhyve/pci_virtio_console.c index 88d6c37f582e..6832a3f92774 100644 --- a/usr.sbin/bhyve/pci_virtio_console.c +++ b/usr.sbin/bhyve/pci_virtio_console.c @@ -1,759 +1,759 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 iXsystems Inc. * All rights reserved. * * This software was developed by Jakub Klama * under sponsorship from iXsystems Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "mevent.h" #include "sockstream.h" #define VTCON_RINGSZ 64 #define VTCON_MAXPORTS 16 #define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) #define VTCON_DEVICE_READY 0 #define VTCON_DEVICE_ADD 1 #define VTCON_DEVICE_REMOVE 2 #define VTCON_PORT_READY 3 #define VTCON_CONSOLE_PORT 4 #define VTCON_CONSOLE_RESIZE 5 #define VTCON_PORT_OPEN 6 #define VTCON_PORT_NAME 7 #define VTCON_F_SIZE 0 #define VTCON_F_MULTIPORT 1 #define VTCON_F_EMERG_WRITE 2 #define VTCON_S_HOSTCAPS \ (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) static int pci_vtcon_debug; #define DPRINTF(params) if (pci_vtcon_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtcon_softc; struct pci_vtcon_port; struct pci_vtcon_config; typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, int); struct pci_vtcon_port { struct pci_vtcon_softc * vsp_sc; int vsp_id; const char * vsp_name; bool vsp_enabled; bool vsp_console; bool vsp_rx_ready; bool vsp_open; int vsp_rxq; int vsp_txq; void * vsp_arg; pci_vtcon_cb_t * vsp_cb; }; struct pci_vtcon_sock { struct pci_vtcon_port * vss_port; const char * vss_path; struct mevent * vss_server_evp; struct mevent * vss_conn_evp; int vss_server_fd; int vss_conn_fd; bool vss_open; }; struct pci_vtcon_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTCON_MAXQ]; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootdir; int vsc_kq; bool vsc_ready; struct pci_vtcon_port vsc_control_port; struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; struct pci_vtcon_config *vsc_config; }; struct pci_vtcon_config { uint16_t cols; uint16_t rows; uint32_t max_nr_ports; uint32_t emerg_wr; } __attribute__((packed)); struct pci_vtcon_control { uint32_t id; uint16_t event; uint16_t value; } __attribute__((packed)); struct pci_vtcon_console_resize { uint16_t cols; uint16_t rows; } __attribute__((packed)); static void pci_vtcon_reset(void *); static void pci_vtcon_notify_rx(void *, struct vqueue_info *); static void pci_vtcon_notify_tx(void *, struct vqueue_info *); static int pci_vtcon_cfgread(void *, int, int, uint32_t *); static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); static void pci_vtcon_neg_features(void *, uint64_t); static void pci_vtcon_sock_accept(int, enum ev_type, void *); static void pci_vtcon_sock_rx(int, enum ev_type, void *); static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, int); static void pci_vtcon_control_send(struct pci_vtcon_softc *, struct pci_vtcon_control *, const void *, size_t); static void pci_vtcon_announce_port(struct pci_vtcon_port *); static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); static struct virtio_consts vtcon_vi_consts = { "vtcon", /* our name */ VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ sizeof(struct pci_vtcon_config), /* config reg size */ pci_vtcon_reset, /* reset */ NULL, /* device-wide qnotify */ pci_vtcon_cfgread, /* read virtio config */ pci_vtcon_cfgwrite, /* write virtio config */ pci_vtcon_neg_features, /* apply negotiated features */ VTCON_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtcon_reset(void *vsc) { struct pci_vtcon_softc *sc; sc = vsc; DPRINTF(("vtcon: device reset requested!")); vi_reset_dev(&sc->vsc_vs); } static void pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtcon_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtcon_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) { return (0); } static inline struct pci_vtcon_port * pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) { uint16_t num = vq->vq_num; if (num == 0 || num == 1) return (&sc->vsc_ports[0]); if (num == 2 || num == 3) return (&sc->vsc_control_port); return (&sc->vsc_ports[(num / 2) - 1]); } static inline struct vqueue_info * pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) { int qnum; qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; return (&port->vsp_sc->vsc_queues[qnum]); } static struct pci_vtcon_port * pci_vtcon_port_add(struct pci_vtcon_softc *sc, int port_id, const char *name, pci_vtcon_cb_t *cb, void *arg) { struct pci_vtcon_port *port; port = &sc->vsc_ports[port_id]; if (port->vsp_enabled) { errno = EBUSY; return (NULL); } port->vsp_id = port_id; port->vsp_sc = sc; port->vsp_name = name; port->vsp_cb = cb; port->vsp_arg = arg; if (port->vsp_id == 0) { /* port0 */ port->vsp_txq = 0; port->vsp_rxq = 1; } else { port->vsp_txq = (port_id + 1) * 2; port->vsp_rxq = port->vsp_txq + 1; } port->vsp_enabled = true; return (port); } static int pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *port_name, const nvlist_t *nvl) { struct pci_vtcon_sock *sock; struct sockaddr_un sun; const char *name, *path; char *cp, *pathcopy; long port; int s = -1, fd = -1, error = 0; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif port = strtol(port_name, &cp, 0); if (*cp != '\0' || port < 0 || port >= VTCON_MAXPORTS) { EPRINTLN("vtcon: Invalid port %s", port_name); error = -1; goto out; } path = get_config_value_node(nvl, "path"); if (path == NULL) { EPRINTLN("vtcon: required path missing for port %ld", port); error = -1; goto out; } sock = calloc(1, sizeof(struct pci_vtcon_sock)); if (sock == NULL) { error = -1; goto out; } s = socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { error = -1; goto out; } pathcopy = strdup(path); if (pathcopy == NULL) { error = -1; goto out; } fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); if (fd < 0) { free(pathcopy); error = -1; goto out; } sun.sun_family = AF_UNIX; sun.sun_len = sizeof(struct sockaddr_un); strcpy(pathcopy, path); strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); free(pathcopy); if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { error = -1; goto out; } if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { error = -1; goto out; } if (listen(s, 1) < 0) { error = -1; goto out; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif name = get_config_value_node(nvl, "name"); if (name == NULL) { EPRINTLN("vtcon: required name missing for port %ld", port); error = -1; goto out; } sock->vss_port = pci_vtcon_port_add(sc, port, name, pci_vtcon_sock_tx, sock); if (sock->vss_port == NULL) { error = -1; goto out; } sock->vss_open = false; sock->vss_conn_fd = -1; sock->vss_server_fd = s; sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, sock); if (sock->vss_server_evp == NULL) { error = -1; goto out; } out: if (fd != -1) close(fd); if (error != 0) { if (s != -1) close(s); free(sock); } return (error); } static void pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; int s; s = accept(sock->vss_server_fd, NULL, NULL); if (s < 0) return; if (sock->vss_open) { close(s); return; } sock->vss_open = true; sock->vss_conn_fd = s; sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); pci_vtcon_open_port(sock->vss_port, true); } static void pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_port *port; struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; struct vqueue_info *vq; + struct vi_req req; struct iovec iov; static char dummybuf[2048]; int len, n; - uint16_t idx; port = sock->vss_port; vq = pci_vtcon_port_to_vq(port, true); if (!sock->vss_open || !port->vsp_rx_ready) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); if (len == 0) goto close; return; } if (!vq_has_descs(vq)) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); if (len == 0) goto close; return; } do { - n = vq_getchain(vq, &idx, &iov, 1, NULL); + n = vq_getchain(vq, &iov, 1, &req); len = readv(sock->vss_conn_fd, &iov, n); if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { vq_retchains(vq, 1); vq_endchains(vq, 0); if (len == 0) goto close; return; } - vq_relchain(vq, idx, len); + vq_relchain(vq, req.idx, len); } while (vq_has_descs(vq)); vq_endchains(vq, 1); close: mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } static void pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_sock *sock; int i, ret; sock = (struct pci_vtcon_sock *)arg; if (sock->vss_conn_fd == -1) return; for (i = 0; i < niov; i++) { ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, iov[i].iov_len); if (ret <= 0) break; } if (ret <= 0) { mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } } static void pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *tmp; struct pci_vtcon_control resp, *ctrl; int i; assert(niov == 1); sc = port->vsp_sc; ctrl = (struct pci_vtcon_control *)iov->iov_base; switch (ctrl->event) { case VTCON_DEVICE_READY: sc->vsc_ready = true; /* set port ready events for registered ports */ for (i = 0; i < VTCON_MAXPORTS; i++) { tmp = &sc->vsc_ports[i]; if (tmp->vsp_enabled) pci_vtcon_announce_port(tmp); if (tmp->vsp_open) pci_vtcon_open_port(tmp, true); } break; case VTCON_PORT_READY: tmp = &sc->vsc_ports[ctrl->id]; if (ctrl->id >= VTCON_MAXPORTS || !tmp->vsp_enabled) { WPRINTF(("VTCON_PORT_READY event for unknown port %d", ctrl->id)); return; } if (tmp->vsp_console) { resp.event = VTCON_CONSOLE_PORT; resp.id = ctrl->id; resp.value = 1; pci_vtcon_control_send(sc, &resp, NULL, 0); } break; } } static void pci_vtcon_announce_port(struct pci_vtcon_port *port) { struct pci_vtcon_control event; event.id = port->vsp_id; event.event = VTCON_DEVICE_ADD; event.value = 1; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); event.event = VTCON_PORT_NAME; pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, strlen(port->vsp_name)); } static void pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) { struct pci_vtcon_control event; if (!port->vsp_sc->vsc_ready) { port->vsp_open = true; return; } event.id = port->vsp_id; event.event = VTCON_PORT_OPEN; event.value = (int)open; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); } static void pci_vtcon_control_send(struct pci_vtcon_softc *sc, struct pci_vtcon_control *ctrl, const void *payload, size_t len) { struct vqueue_info *vq; + struct vi_req req; struct iovec iov; - uint16_t idx; int n; vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); if (!vq_has_descs(vq)) return; - n = vq_getchain(vq, &idx, &iov, 1, NULL); + n = vq_getchain(vq, &iov, 1, &req); assert(n == 1); memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); if (payload != NULL && len > 0) memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), payload, len); - vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_relchain(vq, req.idx, sizeof(struct pci_vtcon_control) + len); vq_endchains(vq, 1); } static void pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; struct iovec iov[1]; - uint16_t idx, n; - uint16_t flags[8]; + struct vi_req req; + uint16_t n; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); while (vq_has_descs(vq)) { - n = vq_getchain(vq, &idx, iov, 1, flags); + n = vq_getchain(vq, iov, 1, &req); assert(n >= 1); if (port != NULL) port->vsp_cb(port, port->vsp_arg, iov, 1); /* * Release this chain and handle more */ - vq_relchain(vq, idx, 0); + vq_relchain(vq, req.idx, 0); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static void pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); if (!port->vsp_rx_ready) { port->vsp_rx_ready = 1; vq_kick_disable(vq); } } /* * Each console device has a "port" node which contains nodes for * each port. Ports are numbered starting at 0. */ static int pci_vtcon_legacy_config_port(nvlist_t *nvl, int port, char *opt) { char *name, *path; char node_name[sizeof("XX")]; nvlist_t *port_nvl; name = strsep(&opt, "="); path = opt; if (path == NULL) { EPRINTLN("vtcon: port %s requires a path", name); return (-1); } if (port >= VTCON_MAXPORTS) { EPRINTLN("vtcon: too many ports"); return (-1); } snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "name", name); set_config_value_node(port_nvl, "path", path); return (0); } static int pci_vtcon_legacy_config(nvlist_t *nvl, const char *opts) { char *opt, *str, *tofree; nvlist_t *ports_nvl; int error, port; ports_nvl = create_relative_config_node(nvl, "port"); tofree = str = strdup(opts); error = 0; port = 0; while ((opt = strsep(&str, ",")) != NULL) { error = pci_vtcon_legacy_config_port(ports_nvl, port, opt); if (error) break; port++; } free(tofree); return (error); } static int pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtcon_softc *sc; nvlist_t *ports_nvl; int i; sc = calloc(1, sizeof(struct pci_vtcon_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; sc->vsc_config->cols = 80; sc->vsc_config->rows = 25; vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; for (i = 0; i < VTCON_MAXQ; i++) { sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; sc->vsc_queues[i].vq_notify = i % 2 == 0 ? pci_vtcon_notify_rx : pci_vtcon_notify_tx; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_CONSOLE); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); /* create control port */ sc->vsc_control_port.vsp_sc = sc; sc->vsc_control_port.vsp_txq = 2; sc->vsc_control_port.vsp_rxq = 3; sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; sc->vsc_control_port.vsp_enabled = true; ports_nvl = find_relative_config_node(nvl, "port"); if (ports_nvl != NULL) { const char *name; void *cookie; int type; cookie = NULL; while ((name = nvlist_next(ports_nvl, &type, &cookie)) != NULL) { if (type != NV_TYPE_NVLIST) continue; if (pci_vtcon_sock_add(sc, name, nvlist_get_nvlist(ports_nvl, name)) < 0) { EPRINTLN("cannot create port %s: %s", name, strerror(errno)); return (1); } } } return (0); } struct pci_devemu pci_de_vcon = { .pe_emu = "virtio-console", .pe_init = pci_vtcon_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vcon); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 0ea470a71b56..d253b081d13a 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -1,817 +1,819 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 #define VTNET_MAX_PKT_LEN (65536 + 64) #define VTNET_MIN_MTU ETHERMIN #define VTNET_MAX_MTU 65535 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; uint16_t max_virtqueue_pairs; uint16_t mtu; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; bool features_negotiated; /* protected by rx_mtx */ int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; size_t vhdrlen; size_t be_vhdrlen; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *); static void pci_vtnet_resume(void *); static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ #ifdef BHYVE_SNAPSHOT pci_vtnet_pause, /* pause rx/tx threads */ pci_vtnet_resume, /* resume rx/tx threads */ pci_vtnet_snapshot, /* save / restore device state */ #endif }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* * Make sure receive operation is disabled at least until we * re-negotiate the features, since receive operation depends * on the value of sc->rx_merge and the header length, which * are both set in pci_vtnet_neg_features(). * Receive operation will be enabled again once the guest adds * the first receive buffers and kicks us. */ sc->features_negotiated = false; netbe_rx_disable(sc->vsc_be); /* Set sc->resetting and give a chance to the TX thread to stop. */ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } static __inline struct iovec * iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) { struct iovec *riov; if (iov[0].iov_len < hlen) { /* * Not enough header space in the first fragment. * That's not ok for us. */ return NULL; } iov[0].iov_len -= hlen; if (iov[0].iov_len == 0) { *iovcnt -= 1; if (*iovcnt == 0) { /* * Only space for the header. That's not * enough for us. */ return NULL; } riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); riov = &iov[0]; } return (riov); } struct virtio_mrg_rxbuf_info { uint16_t idx; uint16_t pad; uint32_t len; }; static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; + struct vi_req req; vq = &sc->vsc_queues[VTNET_RXQ]; /* Features must be negotiated */ if (!sc->features_negotiated) { return; } for (;;) { struct virtio_net_rxhdr *hdr; uint32_t riov_bytes; struct iovec *riov; uint32_t ulen; int riov_len; int n_chains; ssize_t rlen; ssize_t plen; plen = netbe_peek_recvlen(sc->vsc_be); if (plen <= 0) { /* * No more packets (plen == 0), or backend errored * (plen < 0). Interrupt if needed and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } plen += prepend_hdr_len; /* * Get a descriptor chain to store the next ingress * packet. In case of mergeable rx buffers, get as * many chains as necessary in order to make room * for plen bytes. */ riov_bytes = 0; riov_len = 0; riov = iov; n_chains = 0; do { - int n = vq_getchain(vq, &info[n_chains].idx, riov, - VTNET_MAXSEGS - riov_len, NULL); + int n = vq_getchain(vq, riov, VTNET_MAXSEGS - riov_len, + &req); + info[n_chains].idx = req.idx; if (n == 0) { /* * No rx buffers. Enable RX kicks and double * check. */ vq_kick_enable(vq); if (!vq_has_descs(vq)) { /* * Still no buffers. Return the unused * chains (if any), interrupt if needed * (including for NOTIFY_ON_EMPTY), and * disable the backend until the next * kick. */ vq_retchains(vq, n_chains); vq_endchains(vq, /*used_all_avail=*/1); netbe_rx_disable(sc->vsc_be); return; } /* More rx buffers found, so keep going. */ vq_kick_disable(vq); continue; } assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); riov_len += n; if (!sc->rx_merge) { n_chains = 1; break; } info[n_chains].len = (uint32_t)count_iov(riov, n); riov_bytes += info[n_chains].len; riov += n; n_chains++; } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); riov = iov; hdr = riov[0].iov_base; if (prepend_hdr_len > 0) { /* * The frontend uses a virtio-net header, but the * backend does not. We need to prepend a zeroed * header. */ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); if (riov == NULL) { /* * The first collected chain is nonsensical, * as it is not even enough to store the * virtio-net header. Just drop it. */ vq_relchain(vq, info[0].idx, 0); vq_retchains(vq, n_chains - 1); continue; } memset(hdr, 0, prepend_hdr_len); } rlen = netbe_recv(sc->vsc_be, riov, riov_len); if (rlen != plen - prepend_hdr_len) { /* * If this happens it means there is something * wrong with the backend (e.g., some other * process is stealing our packets). */ WPRINTF(("netbe_recv: expected %zd bytes, " "got %zd", plen - prepend_hdr_len, rlen)); vq_retchains(vq, n_chains); continue; } ulen = (uint32_t)plen; /* * Publish the used buffers to the guest, reporting the * number of bytes that we wrote. */ if (!sc->rx_merge) { vq_relchain(vq, info[0].idx, ulen); } else { uint32_t iolen; int i = 0; do { iolen = info[i].len; if (iolen > ulen) { iolen = ulen; } vq_relchain_prepare(vq, info[i].idx, iolen); ulen -= iolen; i++; } while (ulen > 0); hdr->vrh_bufs = i; vq_relchain_publish(vq); assert(i == n_chains); } } } /* * Called when there is read activity on the backend file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. * Enable RX only if features are negotiated. */ pthread_mutex_lock(&sc->rx_mtx); if (!sc->features_negotiated) { pthread_mutex_unlock(&sc->rx_mtx); return; } vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; struct iovec *siov = iov; - uint16_t idx; + struct vi_req req; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ - n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + n = vq_getchain(vq, iov, VTNET_MAXSEGS, &req); assert(n >= 1 && n <= VTNET_MAXSEGS); if (sc->vhdrlen != sc->be_vhdrlen) { /* * The frontend uses a virtio-net header, but the backend * does not. We simply strip the header and ignore it, as * it should be zero-filled. */ siov = iov_trim_hdr(siov, &n, sc->vhdrlen); } if (siov == NULL) { /* The chain is nonsensical. Just drop it. */ len = 0; } else { len = netbe_send(sc->vsc_be, siov, n); if (len < 0) { /* * If send failed, report that 0 bytes * were read. */ len = 0; } } /* * Return the processed chain to the guest, reporting * the number of bytes that we read. */ - vq_relchain(vq, idx, len); + vq_relchain(vq, req.idx, len); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? */ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!")); } #endif static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtnet_softc *sc; const char *value; char tname[MAXCOMLEN + 1]; unsigned long mtu = ETHERMTU; int err; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. */ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif value = get_config_value_node(nvl, "mac"); if (value != NULL) { err = net_parsemac(value, sc->vsc_config.mac); if (err) { free(sc); return (err); } } else net_genmac(pi, sc->vsc_config.mac); value = get_config_value_node(nvl, "mtu"); if (value != NULL) { err = net_parsemtu(value, &mtu); if (err) { free(sc); return (err); } if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { err = EINVAL; errno = EINVAL; free(sc); return (err); } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; } sc->vsc_config.mtu = mtu; /* Permit interfaces without a configured backend. */ if (get_config_value_node(nvl, "backend") != NULL) { err = netbe_init(&sc->vsc_be, nvl, pci_vtnet_rx_callback, sc); if (err) { free(sc); return (err); } } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | netbe_get_cap(sc->vsc_be); /* * Since we do not actually support multiqueue, * set the maximum virtqueue pairs to 1. */ sc->vsc_config.max_virtqueue_pairs = 1; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is always up. */ sc->vsc_config.status = 1; vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 0; sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { sc->vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_merge = 1; } else { /* * Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; sc->rx_merge = 0; } /* Tell the backend to enable some capabilities it has advertised. */ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen); pthread_mutex_lock(&sc->rx_mtx); sc->features_negotiated = true; pthread_mutex_unlock(&sc->rx_mtx); } #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device pause requested !\n")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* Wait for the transmit thread to finish its processing. */ pthread_mutex_lock(&sc->tx_mtx); while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } } static void pci_vtnet_resume(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device resume requested !\n")); pthread_mutex_unlock(&sc->tx_mtx); /* The RX lock should have been acquired in vtnet_pause. */ pthread_mutex_unlock(&sc->rx_mtx); } static int pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device snapshot requested !\n")); /* * Queues and consts should have been saved by the more generic * vi_pci_snapshot function. We need to save only our features and * config. */ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); /* Force reapply negociated features at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) { pci_vtnet_neg_features(sc, sc->vsc_features); netbe_rx_enable(sc->vsc_be); } SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done); done: return (ret); } #endif static struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_legacy_config = netbe_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pci_virtio_rnd.c b/usr.sbin/bhyve/pci_virtio_rnd.c index d51301b32534..1d2d6144f949 100644 --- a/usr.sbin/bhyve/pci_virtio_rnd.c +++ b/usr.sbin/bhyve/pci_virtio_rnd.c @@ -1,213 +1,213 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * virtio entropy device emulation. * Randomness is sourced from /dev/random which does not block * once it has been seeded at bootup. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VTRND_RINGSZ 64 static int pci_vtrnd_debug; #define DPRINTF(params) if (pci_vtrnd_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtrnd_softc { struct virtio_softc vrsc_vs; struct vqueue_info vrsc_vq; pthread_mutex_t vrsc_mtx; uint64_t vrsc_cfg; int vrsc_fd; }; static void pci_vtrnd_reset(void *); static void pci_vtrnd_notify(void *, struct vqueue_info *); static struct virtio_consts vtrnd_vi_consts = { "vtrnd", /* our name */ 1, /* we support 1 virtqueue */ 0, /* config reg size */ pci_vtrnd_reset, /* reset */ pci_vtrnd_notify, /* device-wide qnotify */ NULL, /* read virtio config */ NULL, /* write virtio config */ NULL, /* apply negotiated features */ 0, /* our capabilities */ }; static void pci_vtrnd_reset(void *vsc) { struct pci_vtrnd_softc *sc; sc = vsc; DPRINTF(("vtrnd: device reset requested !")); vi_reset_dev(&sc->vrsc_vs); } static void pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov; struct pci_vtrnd_softc *sc; + struct vi_req req; int len; - uint16_t idx; sc = vsc; if (sc->vrsc_fd < 0) { vq_endchains(vq, 0); return; } while (vq_has_descs(vq)) { - vq_getchain(vq, &idx, &iov, 1, NULL); + vq_getchain(vq, &iov, 1, &req); len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); DPRINTF(("vtrnd: vtrnd_notify(): %d", len)); /* Catastrophe if unable to read from /dev/random */ assert(len > 0); /* * Release this chain and handle more */ - vq_relchain(vq, idx, len); + vq_relchain(vq, req.idx, len); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtrnd_softc *sc; int fd; int len; uint8_t v; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif /* * Should always be able to open /dev/random. */ fd = open("/dev/random", O_RDONLY | O_NONBLOCK); assert(fd >= 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_READ); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Check that device is seeded and non-blocking. */ len = read(fd, &v, sizeof(v)); if (len <= 0) { WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); close(fd); return (1); } sc = calloc(1, sizeof(struct pci_vtrnd_softc)); vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; /* keep /dev/random opened while emulating */ sc->vrsc_fd = fd; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_ENTROPY); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vrsc_vs, 0); return (0); } struct pci_devemu pci_de_vrnd = { .pe_emu = "virtio-rnd", .pe_init = pci_vtrnd_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, #endif }; PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr.sbin/bhyve/pci_virtio_scsi.c b/usr.sbin/bhyve/pci_virtio_scsi.c index eee9847494dc..aed2fe2dbb23 100644 --- a/usr.sbin/bhyve/pci_virtio_scsi.c +++ b/usr.sbin/bhyve/pci_virtio_scsi.c @@ -1,746 +1,738 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Jakub Klama . * Copyright (c) 2018 Marcelo Araujo . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "iov.h" #define VTSCSI_RINGSZ 64 #define VTSCSI_REQUESTQ 1 #define VTSCSI_THR_PER_Q 16 #define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) #define VTSCSI_MAXSEG 64 #define VTSCSI_IN_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) #define VTSCSI_OUT_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) #define VIRTIO_SCSI_MAX_CHANNEL 0 #define VIRTIO_SCSI_MAX_TARGET 0 #define VIRTIO_SCSI_MAX_LUN 16383 #define VIRTIO_SCSI_F_INOUT (1 << 0) #define VIRTIO_SCSI_F_HOTPLUG (1 << 1) #define VIRTIO_SCSI_F_CHANGE (1 << 2) static int pci_vtscsi_debug = 0; #define DPRINTF(params) if (pci_vtscsi_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtscsi_config { uint32_t num_queues; uint32_t seg_max; uint32_t max_sectors; uint32_t cmd_per_lun; uint32_t event_info_size; uint32_t sense_size; uint32_t cdb_size; uint16_t max_channel; uint16_t max_target; uint32_t max_lun; } __attribute__((packed)); struct pci_vtscsi_queue { struct pci_vtscsi_softc * vsq_sc; struct vqueue_info * vsq_vq; pthread_mutex_t vsq_mtx; pthread_mutex_t vsq_qmtx; pthread_cond_t vsq_cv; STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; LIST_HEAD(, pci_vtscsi_worker) vsq_workers; }; struct pci_vtscsi_worker { struct pci_vtscsi_queue * vsw_queue; pthread_t vsw_thread; bool vsw_exiting; LIST_ENTRY(pci_vtscsi_worker) vsw_link; }; struct pci_vtscsi_request { struct pci_vtscsi_queue * vsr_queue; struct iovec vsr_iov_in[VTSCSI_MAXSEG]; int vsr_niov_in; struct iovec vsr_iov_out[VTSCSI_MAXSEG]; int vsr_niov_out; uint32_t vsr_idx; STAILQ_ENTRY(pci_vtscsi_request) vsr_link; }; /* * Per-device softc */ struct pci_vtscsi_softc { struct virtio_softc vss_vs; struct vqueue_info vss_vq[VTSCSI_MAXQ]; struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; pthread_mutex_t vss_mtx; int vss_iid; int vss_ctl_fd; uint32_t vss_features; struct pci_vtscsi_config vss_config; }; #define VIRTIO_SCSI_T_TMF 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 #define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 #define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 #define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 #define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 #define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 #define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 /* command-specific response values */ #define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 #define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 #define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 struct pci_vtscsi_ctrl_tmf { uint32_t type; uint32_t subtype; uint8_t lun[8]; uint64_t id; uint8_t response; } __attribute__((packed)); #define VIRTIO_SCSI_T_AN_QUERY 1 #define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 #define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 #define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 #define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 #define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 #define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 struct pci_vtscsi_ctrl_an { uint32_t type; uint8_t lun[8]; uint32_t event_requested; uint32_t event_actual; uint8_t response; } __attribute__((packed)); /* command-specific response values */ #define VIRTIO_SCSI_S_OK 0 #define VIRTIO_SCSI_S_OVERRUN 1 #define VIRTIO_SCSI_S_ABORTED 2 #define VIRTIO_SCSI_S_BAD_TARGET 3 #define VIRTIO_SCSI_S_RESET 4 #define VIRTIO_SCSI_S_BUSY 5 #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 #define VIRTIO_SCSI_S_TARGET_FAILURE 7 #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 #define VIRTIO_SCSI_S_FAILURE 9 #define VIRTIO_SCSI_S_INCORRECT_LUN 12 /* task_attr */ #define VIRTIO_SCSI_S_SIMPLE 0 #define VIRTIO_SCSI_S_ORDERED 1 #define VIRTIO_SCSI_S_HEAD 2 #define VIRTIO_SCSI_S_ACA 3 struct pci_vtscsi_event { uint32_t event; uint8_t lun[8]; uint32_t reason; } __attribute__((packed)); struct pci_vtscsi_req_cmd_rd { uint8_t lun[8]; uint64_t id; uint8_t task_attr; uint8_t prio; uint8_t crn; uint8_t cdb[]; } __attribute__((packed)); struct pci_vtscsi_req_cmd_wr { uint32_t sense_len; uint32_t residual; uint16_t status_qualifier; uint8_t status; uint8_t response; uint8_t sense[]; } __attribute__((packed)); static void *pci_vtscsi_proc(void *); static void pci_vtscsi_reset(void *); static void pci_vtscsi_neg_features(void *, uint64_t); static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); static inline int pci_vtscsi_get_lun(uint8_t *); static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_tmf *); static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_an *); static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, int, struct iovec *, int); static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, struct pci_vtscsi_queue *, int); static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, nvlist_t *); static struct virtio_consts vtscsi_vi_consts = { "vtscsi", /* our name */ VTSCSI_MAXQ, /* we support 2+n virtqueues */ sizeof(struct pci_vtscsi_config), /* config reg size */ pci_vtscsi_reset, /* reset */ NULL, /* device-wide qnotify */ pci_vtscsi_cfgread, /* read virtio config */ pci_vtscsi_cfgwrite, /* write virtio config */ pci_vtscsi_neg_features, /* apply negotiated features */ 0, /* our capabilities */ }; static void * pci_vtscsi_proc(void *arg) { struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; struct pci_vtscsi_queue *q = worker->vsw_queue; struct pci_vtscsi_request *req; int iolen; for (;;) { pthread_mutex_lock(&q->vsq_mtx); while (STAILQ_EMPTY(&q->vsq_requests) && !worker->vsw_exiting) pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); if (worker->vsw_exiting) break; req = STAILQ_FIRST(&q->vsq_requests); STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); pthread_mutex_unlock(&q->vsq_mtx); iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); pthread_mutex_lock(&q->vsq_qmtx); vq_relchain(q->vsq_vq, req->vsr_idx, iolen); vq_endchains(q->vsq_vq, 0); pthread_mutex_unlock(&q->vsq_qmtx); DPRINTF(("virtio-scsi: request completed", req->vsr_idx)); free(req); } pthread_mutex_unlock(&q->vsq_mtx); return (NULL); } static void pci_vtscsi_reset(void *vsc) { struct pci_vtscsi_softc *sc; sc = vsc; DPRINTF(("vtscsi: device reset requested")); vi_reset_dev(&sc->vss_vs); /* initialize config structure */ sc->vss_config = (struct pci_vtscsi_config){ .num_queues = VTSCSI_REQUESTQ, /* Leave room for the request and the response. */ .seg_max = VTSCSI_MAXSEG - 2, .max_sectors = 2, .cmd_per_lun = 1, .event_info_size = sizeof(struct pci_vtscsi_event), .sense_size = 96, .cdb_size = 32, .max_channel = VIRTIO_SCSI_MAX_CHANNEL, .max_target = VIRTIO_SCSI_MAX_TARGET, .max_lun = VIRTIO_SCSI_MAX_LUN }; } static void pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtscsi_softc *sc = vsc; sc->vss_features = negotiated_features; } static int pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtscsi_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vss_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) { return (0); } static inline int pci_vtscsi_get_lun(uint8_t *lun) { return (((lun[2] << 8) | lun[3]) & 0x3fff); } static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, size_t bufsize) { struct pci_vtscsi_ctrl_tmf *tmf; struct pci_vtscsi_ctrl_an *an; uint32_t type; type = *(uint32_t *)buf; if (type == VIRTIO_SCSI_T_TMF) { tmf = (struct pci_vtscsi_ctrl_tmf *)buf; return (pci_vtscsi_tmf_handle(sc, tmf)); } if (type == VIRTIO_SCSI_T_AN_QUERY) { an = (struct pci_vtscsi_ctrl_an *)buf; return (pci_vtscsi_an_handle(sc, an)); } return (0); } static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_tmf *tmf) { union ctl_io *io; int err; io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.io_type = CTL_IO_TASK; io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); io->taskio.tag_type = CTL_TAG_SIMPLE; io->taskio.tag_num = (uint32_t)tmf->id; switch (tmf->subtype) { case VIRTIO_SCSI_T_TMF_ABORT_TASK: io->taskio.task_action = CTL_TASK_ABORT_TASK; break; case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; break; case VIRTIO_SCSI_T_TMF_CLEAR_ACA: io->taskio.task_action = CTL_TASK_CLEAR_ACA; break; case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; break; case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; break; case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: io->taskio.task_action = CTL_TASK_LUN_RESET; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK: io->taskio.task_action = CTL_TASK_QUERY_TASK; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; break; } if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); tmf->response = io->taskio.task_status; ctl_scsi_free_io(io); return (1); } static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_an *an) { return (0); } static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, int niov_in, struct iovec *iov_out, int niov_out) { struct pci_vtscsi_softc *sc = q->vsq_sc; struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; struct pci_vtscsi_req_cmd_wr *cmd_wr; struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; union ctl_io *io; int data_niov_in, data_niov_out; void *ext_data_ptr = NULL; uint32_t ext_data_len = 0, ext_sg_entries = 0; int err, nxferred; seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, VTSCSI_IN_HEADER_LEN(sc)); seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, VTSCSI_OUT_HEADER_LEN(sc)); truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); io->io_hdr.io_type = CTL_IO_SCSI; if (data_niov_in > 0) { ext_data_ptr = (void *)data_iov_in; ext_sg_entries = data_niov_in; ext_data_len = count_iov(data_iov_in, data_niov_in); io->io_hdr.flags |= CTL_FLAG_DATA_OUT; } else if (data_niov_out > 0) { ext_data_ptr = (void *)data_iov_out; ext_sg_entries = data_niov_out; ext_data_len = count_iov(data_iov_out, data_niov_out); io->io_hdr.flags |= CTL_FLAG_DATA_IN; } io->scsiio.sense_len = sc->vss_config.sense_size; io->scsiio.tag_num = (uint32_t)cmd_rd->id; switch (cmd_rd->task_attr) { case VIRTIO_SCSI_S_ORDERED: io->scsiio.tag_type = CTL_TAG_ORDERED; break; case VIRTIO_SCSI_S_HEAD: io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; break; case VIRTIO_SCSI_S_ACA: io->scsiio.tag_type = CTL_TAG_ACA; break; case VIRTIO_SCSI_S_SIMPLE: default: io->scsiio.tag_type = CTL_TAG_SIMPLE; break; } io->scsiio.ext_sg_entries = ext_sg_entries; io->scsiio.ext_data_ptr = ext_data_ptr; io->scsiio.ext_data_len = ext_data_len; io->scsiio.ext_data_filled = 0; io->scsiio.cdb_len = sc->vss_config.cdb_size; memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) { WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); cmd_wr->response = VIRTIO_SCSI_S_FAILURE; } else { cmd_wr->sense_len = MIN(io->scsiio.sense_len, sc->vss_config.sense_size); cmd_wr->residual = io->scsiio.residual; cmd_wr->status = io->scsiio.scsi_status; cmd_wr->response = VIRTIO_SCSI_S_OK; memcpy(&cmd_wr->sense, &io->scsiio.sense_data, cmd_wr->sense_len); } buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; free(cmd_rd); free(cmd_wr); ctl_scsi_free_io(io); return (nxferred); } static void pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct iovec iov[VTSCSI_MAXSEG]; - uint16_t idx, n; + struct vi_req req; + uint16_t n; void *buf = NULL; size_t bufsize; int iolen; sc = vsc; while (vq_has_descs(vq)) { - n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &req); bufsize = iov_to_buf(iov, n, &buf); iolen = pci_vtscsi_control_handle(sc, buf, bufsize); buf_to_iov(buf + bufsize - iolen, iolen, iov, n, bufsize - iolen); /* * Release this chain and handle more */ - vq_relchain(vq, idx, iolen); + vq_relchain(vq, req.idx, iolen); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ free(buf); } static void pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) { vq_kick_disable(vq); } static void pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct pci_vtscsi_queue *q; struct pci_vtscsi_request *req; struct iovec iov[VTSCSI_MAXSEG]; - uint16_t flags[VTSCSI_MAXSEG]; - uint16_t idx, n, i; - int readable; + struct vi_req vireq; + uint16_t n; sc = vsc; q = &sc->vss_queues[vq->vq_num - 2]; while (vq_has_descs(vq)) { - readable = 0; - n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); - - /* Count readable descriptors */ - for (i = 0; i < n; i++) { - if (flags[i] & VRING_DESC_F_WRITE) - break; - - readable++; - } + n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &vireq); req = calloc(1, sizeof(struct pci_vtscsi_request)); - req->vsr_idx = idx; + req->vsr_idx = vireq.idx; req->vsr_queue = q; - req->vsr_niov_in = readable; - req->vsr_niov_out = n - readable; + req->vsr_niov_in = vireq.readable; + req->vsr_niov_out = vireq.writable; memcpy(req->vsr_iov_in, iov, req->vsr_niov_in * sizeof(struct iovec)); - memcpy(req->vsr_iov_out, iov + readable, + memcpy(req->vsr_iov_out, iov + vireq.readable, req->vsr_niov_out * sizeof(struct iovec)); pthread_mutex_lock(&q->vsq_mtx); STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); pthread_cond_signal(&q->vsq_cv); pthread_mutex_unlock(&q->vsq_mtx); - DPRINTF(("virtio-scsi: request enqueued", idx)); + DPRINTF(("virtio-scsi: request enqueued", + vireq.idx)); } } static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, struct pci_vtscsi_queue *queue, int num) { struct pci_vtscsi_worker *worker; char tname[MAXCOMLEN + 1]; int i; queue->vsq_sc = sc; queue->vsq_vq = &sc->vss_vq[num + 2]; pthread_mutex_init(&queue->vsq_mtx, NULL); pthread_mutex_init(&queue->vsq_qmtx, NULL); pthread_cond_init(&queue->vsq_cv, NULL); STAILQ_INIT(&queue->vsq_requests); LIST_INIT(&queue->vsq_workers); for (i = 0; i < VTSCSI_THR_PER_Q; i++) { worker = calloc(1, sizeof(struct pci_vtscsi_worker)); worker->vsw_queue = queue; pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, (void *)worker); snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); pthread_set_name_np(worker->vsw_thread, tname); LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); } return (0); } static int pci_vtscsi_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *devname; cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "dev", opts); return (0); } devname = strndup(opts, cp - opts); set_config_value_node(nvl, "dev", devname); free(devname); return (pci_parse_legacy_config(nvl, cp + 1)); } static int pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtscsi_softc *sc; const char *devname, *value;; int i; sc = calloc(1, sizeof(struct pci_vtscsi_softc)); value = get_config_value_node(nvl, "iid"); if (value != NULL) sc->vss_iid = strtoul(value, NULL, 10); devname = get_config_value_node(nvl, "dev"); if (devname == NULL) devname = "/dev/cam/ctl"; sc->vss_ctl_fd = open(devname, O_RDWR); if (sc->vss_ctl_fd < 0) { WPRINTF(("cannot open %s: %s", devname, strerror(errno))); free(sc); return (1); } vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); sc->vss_vs.vs_mtx = &sc->vss_mtx; /* controlq */ sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; /* eventq */ sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; /* request queues */ for (i = 2; i < VTSCSI_MAXQ; i++) { sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_SCSI); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vss_vs, 0); return (0); } struct pci_devemu pci_de_vscsi = { .pe_emu = "virtio-scsi", .pe_init = pci_vtscsi_init, .pe_legacy_config = pci_vtscsi_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 078a74b759df..7f0485cbb826 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,949 +1,955 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use VIRTIO_PCI_CONFIG_OFF(0) if MSI-X is disabled? * Existing code did not... */ size = VIRTIO_PCI_CONFIG_OFF(1) + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size_aligned(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct vring_desc *)base; base += vq->vq_qsize * sizeof(struct vring_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void -_vq_record(int i, volatile struct vring_desc *vd, struct vmctx *ctx, - struct iovec *iov, int n_iov, uint16_t *flags) { +_vq_record(int i, volatile struct vring_desc *vd, + struct vmctx *ctx, struct iovec *iov, int n_iov, + struct vi_req *reqp) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->addr, vd->len); iov[i].iov_len = vd->len; - if (flags != NULL) - flags[i] = vd->flags; + if ((vd->flags & VRING_DESC_F_WRITE) == 0) + reqp->readable++; + else + reqp->writable++; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the "flags" and "next" field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * - * If you want to verify the WRITE flag on each descriptor, pass a - * non-NULL "flags" pointer to an array of "uint16_t" of the same size - * as n_iov and we'll copy each "flags" field after unwinding any - * indirects. - * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int -vq_getchain(struct vqueue_info *vq, uint16_t *pidx, - struct iovec *iov, int n_iov, uint16_t *flags) +vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, + struct vi_req *reqp) { int i; u_int ndesc, n_indir; u_int idx, next; + struct vi_req req; volatile struct vring_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; + memset(&req, 0, sizeof(req)); /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->idx until all of the descriptors * the guest has written are valid (including all their * "next" fields and "flags"). * * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; - *pidx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; + req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) { - _vq_record(i, vdir, ctx, iov, n_iov, flags); + _vq_record(i, vdir, ctx, iov, niov, &req); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->len / 16; if ((vdir->len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->len); return (-1); } vindir = paddr_guest2host(ctx, vdir->addr, vdir->len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } - _vq_record(i, vp, ctx, iov, n_iov, flags); + _vq_record(i, vp, ctx, iov, niov, &req); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->flags & VRING_DESC_F_NEXT) == 0) break; next = vp->next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->flags & VRING_DESC_F_NEXT) == 0) - return (i); + goto done; } + loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); + +done: + *reqp = req; + return (i); } /* * Return the first n_chain request chains back to the available queue. * * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { volatile struct vring_used *vuh; volatile struct vring_used_elem *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->ring[vq->vq_next_used++ & mask]; vue->id = idx; vue->len = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->idx; /* * Use full memory barrier between "idx" store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * "flags" field below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VIRTIO_PCI_HOST_FEATURES, 4, 1, "HOST_FEATURES" }, { VIRTIO_PCI_GUEST_FEATURES, 4, 0, "GUEST_FEATURES" }, { VIRTIO_PCI_QUEUE_PFN, 4, 0, "QUEUE_PFN" }, { VIRTIO_PCI_QUEUE_NUM, 2, 1, "QUEUE_NUM" }, { VIRTIO_PCI_QUEUE_SEL, 2, 0, "QUEUE_SEL" }, { VIRTIO_PCI_QUEUE_NOTIFY, 2, 0, "QUEUE_NOTIFY" }, { VIRTIO_PCI_STATUS, 1, 0, "STATUS" }, { VIRTIO_PCI_ISR, 1, 0, "ISR" }, { VIRTIO_MSI_CONFIG_VECTOR, 2, 0, "CONFIG_VECTOR" }, { VIRTIO_MSI_QUEUE_VECTOR, 2, 0, "QUEUE_VECTOR" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_HOST_FEATURES: value = vc->vc_hv_caps; break; case VIRTIO_PCI_GUEST_FEATURES: value = vs->vs_negotiated_caps; break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VIRTIO_PCI_QUEUE_NUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VIRTIO_PCI_QUEUE_SEL: value = vs->vs_curq; break; case VIRTIO_PCI_QUEUE_NOTIFY: value = 0; /* XXX */ break; case VIRTIO_PCI_STATUS: value = vs->vs_status; break; case VIRTIO_PCI_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VIRTIO_MSI_CONFIG_VECTOR: value = vs->vs_msix_cfg_idx; break; case VIRTIO_MSI_QUEUE_VECTOR: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_GUEST_FEATURES: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VIRTIO_PCI_QUEUE_SEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VIRTIO_PCI_QUEUE_NOTIFY: if (value >= vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VIRTIO_PCI_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VIRTIO_MSI_CONFIG_VECTOR: vs->vs_msix_cfg_idx = value; break; case VIRTIO_MSI_QUEUE_VECTOR: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } #ifdef BHYVE_SNAPSHOT int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_pause != NULL); (*vc->vc_pause)(DEV_SOFTC(vs)); return (0); } int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_resume != NULL); (*vc->vc_resume)(DEV_SOFTC(vs)); return (0); } static int vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int i; int ret; struct virtio_consts *vc; struct vqueue_info *vq; uint64_t addr_size; vc = vs->vs_vc; /* Save virtio queue info */ for (i = 0; i < vc->vc_nvq; i++) { vq = &vs->vs_queues[i]; SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); addr_size = vq->vq_qsize * sizeof(struct vring_desc); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, false, meta, ret, done); addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, false, meta, ret, done); addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, false, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size_aligned(vq->vq_qsize), meta, ret, done); } done: return (ret); } int vi_pci_snapshot(struct vm_snapshot_meta *meta) { int ret; struct pci_devinst *pi; struct virtio_softc *vs; struct virtio_consts *vc; pi = meta->dev_data; vs = pi->pi_arg; vc = vs->vs_vc; /* Save virtio softc */ ret = vi_pci_snapshot_softc(vs, meta); if (ret != 0) goto done; /* Save virtio consts */ ret = vi_pci_snapshot_consts(vc, meta); if (ret != 0) goto done; /* Save virtio queue info */ ret = vi_pci_snapshot_queues(vs, meta); if (ret != 0) goto done; /* Save device softc, if needed */ if (vc->vc_snapshot != NULL) { ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); if (ret != 0) goto done; } done: return (ret); } #endif diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index c5730f71000e..e03fd5f710d1 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,399 +1,411 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BHYVE_VIRTIO_H_ #define _BHYVE_VIRTIO_H_ #include #include #include #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. * * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). * * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) */ #define VRING_ALIGN 4096 /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_SCSI 0x1008 #define VIRTIO_DEV_9P 0x1009 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline int vring_size_aligned(u_int qsz) { return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN)); } struct vmctx; struct pci_devinst; struct vqueue_info; struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? */ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ void (*vc_pause)(void *); /* called to pause device activity */ void (*vc_resume)(void *); /* called to resume device activity */ int (*vc_snapshot)(void *, struct vm_snapshot_meta *); /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ volatile struct vring_desc *vq_desc; /* descriptor array */ volatile struct vring_avail *vq_avail; /* the "avail" ring */ volatile struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(volatile uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? */ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->idx); } /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { VS_LOCK(vs); vs->vs_isr |= VIRTIO_PCI_ISR_INTR; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vq_used->flags * happens before the load from vq_avail->idx, which results from a * subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; + +/* + * Request description returned by vq_getchain. + * + * Writable iovecs start at iov[req.readable]. + */ +struct vi_req { + int readable; /* num of readable iovecs */ + int writable; /* num of writable iovecs */ + unsigned int idx; /* ring index */ +}; + void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); -int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, - struct iovec *iov, int n_iov, uint16_t *flags); +int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, + struct vi_req *reqp); void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); #ifdef BHYVE_SNAPSHOT int vi_pci_snapshot(struct vm_snapshot_meta *meta); int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); #endif #endif /* _BHYVE_VIRTIO_H_ */