Index: head/usr.sbin/bhyve/net_backends.c =================================================================== --- head/usr.sbin/bhyve/net_backends.c (revision 354551) +++ head/usr.sbin/bhyve/net_backends.c (revision 354552) @@ -1,871 +1,872 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * This file implements multiple network backends (tap, netmap, ...), * to be used by network frontends such as virtio-net and e1000. * The API to access the backend (e.g. send/receive packets, negotiate * features) is exported by net_backends.h. */ #include __FBSDID("$FreeBSD$"); #include /* u_short etc */ #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #define NETMAP_WITH_LIBS #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iov.h" #include "mevent.h" #include "net_backends.h" #include /* * Each network backend registers a set of function pointers that are * used to implement the net backends API. * This might need to be exposed if we implement backends in separate files. */ struct net_backend { const char *prefix; /* prefix matching this backend */ /* * Routines used to initialize and cleanup the resources needed * by a backend. The cleanup function is used internally, * and should not be called by the frontend. */ int (*init)(struct net_backend *be, const char *devname, net_be_rxeof_t cb, void *param); void (*cleanup)(struct net_backend *be); /* * Called to serve a guest transmit request. The scatter-gather * vector provided by the caller has 'iovcnt' elements and contains * the packet to send. */ ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt); /* * Called to receive a packet from the backend. When the function * returns a positive value 'len', the scatter-gather vector * provided by the caller contains a packet with such length. * The function returns 0 if the backend doesn't have a new packet to * receive. */ ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt); /* * Ask the backend to enable or disable receive operation in the * backend. 
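/*
 * Editor's illustrative sketch, not part of this commit: how a backend
 * plugs into the function-pointer API described above.  The "null"
 * backend, its callbacks and its prefix are hypothetical; the sketch
 * assumes the struct net_backend, DATA_SET()/net_backend_set and
 * count_iov() declarations that appear later in this file, so it is
 * kept under #if 0 here.
 */
#if 0
static int
null_init(struct net_backend *be, const char *devname,
    net_be_rxeof_t cb, void *param)
{
	be->fd = -1;			/* no file descriptor needed */
	return (0);
}

static void
null_cleanup(struct net_backend *be)
{
}

static ssize_t
null_send(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	return (count_iov(iov, iovcnt));	/* pretend the packet left */
}

static ssize_t
null_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
{
	return (0);			/* never has a packet to deliver */
}

static void
null_recv_enable(struct net_backend *be)
{
}

static void
null_recv_disable(struct net_backend *be)
{
}

static uint64_t
null_get_cap(struct net_backend *be)
{
	return (0);			/* no offloads */
}

static int
null_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend null_backend = {
	.prefix = "null",
	.priv_size = 0,
	.init = null_init,
	.cleanup = null_cleanup,
	.send = null_send,
	.recv = null_recv,
	.recv_enable = null_recv_enable,
	.recv_disable = null_recv_disable,
	.get_cap = null_get_cap,
	.set_cap = null_set_cap,
};
DATA_SET(net_backend_set, null_backend);
#endif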
On return from a disable operation, it is guaranteed * that the receive callback won't be called until receive is * enabled again. Note however that it is up to the caller to make * sure that netbe_recv() is not currently being executed by another * thread. */ void (*recv_enable)(struct net_backend *be); void (*recv_disable)(struct net_backend *be); /* * Ask the backend for the virtio-net features it is able to * support. Possible features are TSO, UFO and checksum offloading * in both rx and tx direction and for both IPv4 and IPv6. */ uint64_t (*get_cap)(struct net_backend *be); /* * Tell the backend to enable/disable the specified virtio-net * features (capabilities). */ int (*set_cap)(struct net_backend *be, uint64_t features, unsigned int vnet_hdr_len); struct pci_vtnet_softc *sc; int fd; /* * Length of the virtio-net header used by the backend and the * frontend, respectively. A zero value means that the header * is not used. */ unsigned int be_vnet_hdr_len; unsigned int fe_vnet_hdr_len; /* Size of backend-specific private data. */ size_t priv_size; /* Room for backend-specific data. */ char opaque[0]; }; SET_DECLARE(net_backend_set, struct net_backend); #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) #define WPRINTF(params) printf params /* * The tap backend */ struct tap_priv { struct mevent *mevp; }; static void tap_cleanup(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; if (priv->mevp) { mevent_delete(priv->mevp); } if (be->fd != -1) { close(be->fd); be->fd = -1; } } static int tap_init(struct net_backend *be, const char *devname, net_be_rxeof_t cb, void *param) { struct tap_priv *priv = (struct tap_priv *)be->opaque; char tbuf[80]; int opt = 1; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif if (cb == NULL) { WPRINTF(("TAP backend requires non-NULL callback\n")); return (-1); } strcpy(tbuf, "/dev/"); strlcat(tbuf, devname, sizeof(tbuf)); be->fd = open(tbuf, O_RDWR); if (be->fd == -1) { WPRINTF(("open of tap device %s failed\n", tbuf)); goto error; } /* * Set non-blocking and register for read * notifications with the event loop */ if (ioctl(be->fd, FIONBIO, &opt) < 0) { WPRINTF(("tap device O_NONBLOCK failed\n")); goto error; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(be->fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { WPRINTF(("Could not register event\n")); goto error; } return (0); error: tap_cleanup(be); return (-1); } /* * Called to send a buffer chain out to the tap device */ static ssize_t tap_send(struct net_backend *be, struct iovec *iov, int iovcnt) { return (writev(be->fd, iov, iovcnt)); } static ssize_t tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) { ssize_t ret; /* Should never be called without a valid tap fd */ assert(be->fd != -1); ret = readv(be->fd, iov, iovcnt); if (ret < 0 && errno == EWOULDBLOCK) { return (0); } return (ret); } static void tap_recv_enable(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; mevent_enable(priv->mevp); } static void tap_recv_disable(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; mevent_disable(priv->mevp); } static uint64_t tap_get_cap(struct net_backend *be) { return (0); /* no capabilities for now */ } static int tap_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { return ((features || 
vnet_hdr_len) ? -1 : 0); } static struct net_backend tap_backend = { .prefix = "tap", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; /* A clone of the tap backend, with a different prefix. */ static struct net_backend vmnet_backend = { .prefix = "vmnet", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; DATA_SET(net_backend_set, tap_backend); DATA_SET(net_backend_set, vmnet_backend); /* * The netmap backend */ /* The virtio-net features supported by netmap. */ #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ - VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) + VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \ + VIRTIO_NET_F_MRG_RXBUF) struct netmap_priv { char ifname[IFNAMSIZ]; struct nm_desc *nmd; uint16_t memid; struct netmap_ring *rx; struct netmap_ring *tx; struct mevent *mevp; net_be_rxeof_t cb; void *cb_param; }; static void nmreq_init(struct nmreq *req, char *ifname) { memset(req, 0, sizeof(*req)); strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); req->nr_version = NETMAP_API; } static int netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) { int err; struct nmreq req; struct netmap_priv *priv = (struct netmap_priv *)be->opaque; nmreq_init(&req, priv->ifname); req.nr_cmd = NETMAP_BDG_VNET_HDR; req.nr_arg1 = vnet_hdr_len; err = ioctl(be->fd, NIOCREGIF, &req); if (err) { WPRINTF(("Unable to set vnet header length %d\n", vnet_hdr_len)); return (err); } be->be_vnet_hdr_len = vnet_hdr_len; return (0); } static int netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) { int prev_hdr_len = be->be_vnet_hdr_len; int ret; if (vnet_hdr_len == prev_hdr_len) { return (1); } ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); if (ret) { return (0); } netmap_set_vnet_hdr_len(be, prev_hdr_len); return (1); } static uint64_t netmap_get_cap(struct net_backend *be) { return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? 
NETMAP_FEATURES : 0); } static int netmap_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); } static int netmap_init(struct net_backend *be, const char *devname, net_be_rxeof_t cb, void *param) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; strlcpy(priv->ifname, devname, sizeof(priv->ifname)); priv->ifname[sizeof(priv->ifname) - 1] = '\0'; priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); if (priv->nmd == NULL) { WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n", devname, strerror(errno))); free(priv); return (-1); } priv->memid = priv->nmd->req.nr_arg2; priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); priv->cb = cb; priv->cb_param = param; be->fd = priv->nmd->fd; priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { WPRINTF(("Could not register event\n")); return (-1); } return (0); } static void netmap_cleanup(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; if (priv->mevp) { mevent_delete(priv->mevp); } if (priv->nmd) { nm_close(priv->nmd); } be->fd = -1; } static ssize_t netmap_send(struct net_backend *be, struct iovec *iov, int iovcnt) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; struct netmap_ring *ring; ssize_t totlen = 0; int nm_buf_size; int nm_buf_len; uint32_t head; void *nm_buf; int j; ring = priv->tx; head = ring->head; if (head == ring->tail) { WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt))); goto txsync; } nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); nm_buf_size = ring->nr_buf_size; nm_buf_len = 0; for (j = 0; j < iovcnt; j++) { int iov_frag_size = iov[j].iov_len; void *iov_frag_buf = iov[j].iov_base; totlen += iov_frag_size; /* * Split each iovec fragment over more netmap slots, if * necessary. */ for (;;) { int copylen; copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; memcpy(nm_buf, iov_frag_buf, copylen); iov_frag_buf += copylen; iov_frag_size -= copylen; nm_buf += copylen; nm_buf_size -= copylen; nm_buf_len += copylen; if (iov_frag_size == 0) { break; } ring->slot[head].len = nm_buf_len; ring->slot[head].flags = NS_MOREFRAG; head = nm_ring_next(ring, head); if (head == ring->tail) { /* * We ran out of netmap slots while * splitting the iovec fragments. */ WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt))); goto txsync; } nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); nm_buf_size = ring->nr_buf_size; nm_buf_len = 0; } } /* Complete the last slot, which must not have NS_MOREFRAG set. */ ring->slot[head].len = nm_buf_len; ring->slot[head].flags = 0; head = nm_ring_next(ring, head); /* Now update ring->head and ring->cur. */ ring->head = ring->cur = head; txsync: ioctl(be->fd, NIOCTXSYNC, NULL); return (totlen); } static ssize_t netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; struct netmap_slot *slot = NULL; struct netmap_ring *ring; void *iov_frag_buf; int iov_frag_size; ssize_t totlen = 0; uint32_t head; assert(iovcnt); ring = priv->rx; head = ring->head; iov_frag_buf = iov->iov_base; iov_frag_size = iov->iov_len; do { int nm_buf_len; void *nm_buf; if (head == ring->tail) { return (0); } slot = ring->slot + head; nm_buf = NETMAP_BUF(ring, slot->buf_idx); nm_buf_len = slot->len; for (;;) { int copylen = nm_buf_len < iov_frag_size ? 
nm_buf_len : iov_frag_size; memcpy(iov_frag_buf, nm_buf, copylen); nm_buf += copylen; nm_buf_len -= copylen; iov_frag_buf += copylen; iov_frag_size -= copylen; totlen += copylen; if (nm_buf_len == 0) { break; } iov++; iovcnt--; if (iovcnt == 0) { /* No space to receive. */ WPRINTF(("Short iov, drop %zd bytes\n", totlen)); return (-ENOSPC); } iov_frag_buf = iov->iov_base; iov_frag_size = iov->iov_len; } head = nm_ring_next(ring, head); } while (slot->flags & NS_MOREFRAG); /* Release slots to netmap. */ ring->head = ring->cur = head; return (totlen); } static void netmap_recv_enable(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; mevent_enable(priv->mevp); } static void netmap_recv_disable(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; mevent_disable(priv->mevp); } static struct net_backend netmap_backend = { .prefix = "netmap", .priv_size = sizeof(struct netmap_priv), .init = netmap_init, .cleanup = netmap_cleanup, .send = netmap_send, .recv = netmap_recv, .recv_enable = netmap_recv_enable, .recv_disable = netmap_recv_disable, .get_cap = netmap_get_cap, .set_cap = netmap_set_cap, }; /* A clone of the netmap backend, with a different prefix. */ static struct net_backend vale_backend = { .prefix = "vale", .priv_size = sizeof(struct netmap_priv), .init = netmap_init, .cleanup = netmap_cleanup, .send = netmap_send, .recv = netmap_recv, .recv_enable = netmap_recv_enable, .recv_disable = netmap_recv_disable, .get_cap = netmap_get_cap, .set_cap = netmap_set_cap, }; DATA_SET(net_backend_set, netmap_backend); DATA_SET(net_backend_set, vale_backend); /* * Initialize a backend and attach to the frontend. * This is called during frontend initialization. * @pbe is a pointer to the backend to be initialized * @devname is the backend-name as supplied on the command line, * e.g. -s 2:0,frontend-name,backend-name[,other-args] * @cb is the receive callback supplied by the frontend, * and it is invoked in the event loop when a receive * event is generated in the hypervisor, * @param is a pointer to the frontend, and normally used as * the argument for the callback. */ int netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, void *param) { struct net_backend **pbe, *nbe, *tbe = NULL; int err; /* * Find the network backend that matches the user-provided * device name. net_backend_set is built using a linker set. */ SET_FOREACH(pbe, net_backend_set) { if (strncmp(devname, (*pbe)->prefix, strlen((*pbe)->prefix)) == 0) { tbe = *pbe; assert(tbe->init != NULL); assert(tbe->cleanup != NULL); assert(tbe->send != NULL); assert(tbe->recv != NULL); assert(tbe->get_cap != NULL); assert(tbe->set_cap != NULL); break; } } *ret = NULL; if (tbe == NULL) return (EINVAL); nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); *nbe = *tbe; /* copy the template */ nbe->fd = -1; nbe->sc = param; nbe->be_vnet_hdr_len = 0; nbe->fe_vnet_hdr_len = 0; /* Initialize the backend. */ err = nbe->init(nbe, devname, cb, param); if (err) { free(nbe); return (err); } *ret = nbe; return (0); } void netbe_cleanup(struct net_backend *be) { if (be != NULL) { be->cleanup(be); free(be); } } uint64_t netbe_get_cap(struct net_backend *be) { assert(be != NULL); return (be->get_cap(be)); } int netbe_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { int ret; assert(be != NULL); /* There are only three valid lengths, i.e., 0, 10 and 12. 
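/*
 * Editor's note, not part of this commit: the three lengths accepted
 * below match the legacy virtio-net header layouts.  The structs are an
 * illustrative, hypothetical restatement; bhyve's real definition is
 * struct virtio_net_rxhdr (whose trailing field is vrh_bufs).
 */
#include <stdint.h>

struct example_vnet_hdr {		/* 10 bytes: no mergeable rx buffers */
	uint8_t		flags;
	uint8_t		gso_type;
	uint16_t	hdr_len;
	uint16_t	gso_size;
	uint16_t	csum_start;
	uint16_t	csum_offset;
} __attribute__((packed));

struct example_vnet_hdr_mrg {		/* 12 bytes: VIRTIO_NET_F_MRG_RXBUF */
	struct example_vnet_hdr hdr;
	uint16_t	num_buffers;	/* how many chains hold this packet */
} __attribute__((packed));

_Static_assert(sizeof(struct example_vnet_hdr) == 10, "legacy header");
_Static_assert(sizeof(struct example_vnet_hdr_mrg) == 12, "mergeable header");
/* A vnet_hdr_len of 0 simply means the backend uses no header at all. */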
*/ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) return (-1); be->fe_vnet_hdr_len = vnet_hdr_len; ret = be->set_cap(be, features, vnet_hdr_len); assert(be->be_vnet_hdr_len == 0 || be->be_vnet_hdr_len == be->fe_vnet_hdr_len); return (ret); } static __inline struct iovec * iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen) { struct iovec *riov; /* XXX short-cut: assume first segment is >= tlen */ assert(iov[0].iov_len >= tlen); iov[0].iov_len -= tlen; if (iov[0].iov_len == 0) { assert(*iovcnt > 1); *iovcnt -= 1; riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); riov = &iov[0]; } return (riov); } ssize_t netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt) { assert(be != NULL); if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) { /* * The frontend uses a virtio-net header, but the backend * does not. We ignore it (as it must be all zeroes) and * strip it. */ assert(be->be_vnet_hdr_len == 0); iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len); } return (be->send(be, iov, iovcnt)); } /* * Try to read a packet from the backend, without blocking. * If no packets are available, return 0. In case of success, return * the length of the packet just read. Return -1 in case of errors. */ ssize_t netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt) { /* Length of prepended virtio-net header. */ unsigned int hlen = be->fe_vnet_hdr_len; int ret; assert(be != NULL); if (hlen && hlen != be->be_vnet_hdr_len) { /* * The frontend uses a virtio-net header, but the backend * does not. We need to prepend a zeroed header. */ struct virtio_net_rxhdr *vh; assert(be->be_vnet_hdr_len == 0); /* * Get a pointer to the rx header, and use the * data immediately following it for the packet buffer. */ vh = iov[0].iov_base; iov = iov_trim(iov, &iovcnt, hlen); /* * The only valid field in the rx packet header is the * number of buffers if merged rx bufs were negotiated. */ memset(vh, 0, hlen); if (hlen == VNET_HDR_LEN) { vh->vrh_bufs = 1; } } ret = be->recv(be, iov, iovcnt); if (ret > 0) { ret += hlen; } return (ret); } /* * Read a packet from the backend and discard it. * Returns the size of the discarded packet or zero if no packet was available. * A negative error code is returned in case of read error. */ ssize_t netbe_rx_discard(struct net_backend *be) { /* * MP note: the dummybuf is only used to discard frames, * so there is no need for it to be per-vtnet or locked. * We only make it large enough for TSO-sized segment. */ static uint8_t dummybuf[65536 + 64]; struct iovec iov; iov.iov_base = dummybuf; iov.iov_len = sizeof(dummybuf); return netbe_recv(be, &iov, 1); } void netbe_rx_disable(struct net_backend *be) { return be->recv_disable(be); } void netbe_rx_enable(struct net_backend *be) { return be->recv_enable(be); } Index: head/usr.sbin/bhyve/pci_virtio_console.c =================================================================== --- head/usr.sbin/bhyve/pci_virtio_console.c (revision 354551) +++ head/usr.sbin/bhyve/pci_virtio_console.c (revision 354552) @@ -1,681 +1,681 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 iXsystems Inc. * All rights reserved. * * This software was developed by Jakub Klama * under sponsorship from iXsystems Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" #include "mevent.h" #include "sockstream.h" #define VTCON_RINGSZ 64 #define VTCON_MAXPORTS 16 #define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) #define VTCON_DEVICE_READY 0 #define VTCON_DEVICE_ADD 1 #define VTCON_DEVICE_REMOVE 2 #define VTCON_PORT_READY 3 #define VTCON_CONSOLE_PORT 4 #define VTCON_CONSOLE_RESIZE 5 #define VTCON_PORT_OPEN 6 #define VTCON_PORT_NAME 7 #define VTCON_F_SIZE 0 #define VTCON_F_MULTIPORT 1 #define VTCON_F_EMERG_WRITE 2 #define VTCON_S_HOSTCAPS \ (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) static int pci_vtcon_debug; #define DPRINTF(params) if (pci_vtcon_debug) printf params #define WPRINTF(params) printf params struct pci_vtcon_softc; struct pci_vtcon_port; struct pci_vtcon_config; typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, int); struct pci_vtcon_port { struct pci_vtcon_softc * vsp_sc; int vsp_id; const char * vsp_name; bool vsp_enabled; bool vsp_console; bool vsp_rx_ready; bool vsp_open; int vsp_rxq; int vsp_txq; void * vsp_arg; pci_vtcon_cb_t * vsp_cb; }; struct pci_vtcon_sock { struct pci_vtcon_port * vss_port; const char * vss_path; struct mevent * vss_server_evp; struct mevent * vss_conn_evp; int vss_server_fd; int vss_conn_fd; bool vss_open; }; struct pci_vtcon_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTCON_MAXQ]; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootdir; int vsc_kq; int vsc_nports; bool vsc_ready; struct pci_vtcon_port vsc_control_port; struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; struct pci_vtcon_config *vsc_config; }; struct pci_vtcon_config { uint16_t cols; uint16_t rows; uint32_t max_nr_ports; uint32_t emerg_wr; } __attribute__((packed)); struct pci_vtcon_control { uint32_t id; uint16_t event; uint16_t value; } __attribute__((packed)); struct pci_vtcon_console_resize { uint16_t cols; uint16_t rows; } __attribute__((packed)); static void pci_vtcon_reset(void *); static void pci_vtcon_notify_rx(void *, struct vqueue_info *); static void pci_vtcon_notify_tx(void *, struct vqueue_info *); static 
int pci_vtcon_cfgread(void *, int, int, uint32_t *); static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); static void pci_vtcon_neg_features(void *, uint64_t); static void pci_vtcon_sock_accept(int, enum ev_type, void *); static void pci_vtcon_sock_rx(int, enum ev_type, void *); static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, int); static void pci_vtcon_control_send(struct pci_vtcon_softc *, struct pci_vtcon_control *, const void *, size_t); static void pci_vtcon_announce_port(struct pci_vtcon_port *); static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); static struct virtio_consts vtcon_vi_consts = { "vtcon", /* our name */ VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ sizeof(struct pci_vtcon_config), /* config reg size */ pci_vtcon_reset, /* reset */ NULL, /* device-wide qnotify */ pci_vtcon_cfgread, /* read virtio config */ pci_vtcon_cfgwrite, /* write virtio config */ pci_vtcon_neg_features, /* apply negotiated features */ VTCON_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtcon_reset(void *vsc) { struct pci_vtcon_softc *sc; sc = vsc; DPRINTF(("vtcon: device reset requested!\n")); vi_reset_dev(&sc->vsc_vs); } static void pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtcon_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtcon_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) { return (0); } static inline struct pci_vtcon_port * pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) { uint16_t num = vq->vq_num; if (num == 0 || num == 1) return (&sc->vsc_ports[0]); if (num == 2 || num == 3) return (&sc->vsc_control_port); return (&sc->vsc_ports[(num / 2) - 1]); } static inline struct vqueue_info * pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) { int qnum; qnum = tx_queue ? 
port->vsp_txq : port->vsp_rxq; return (&port->vsp_sc->vsc_queues[qnum]); } static struct pci_vtcon_port * pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, pci_vtcon_cb_t *cb, void *arg) { struct pci_vtcon_port *port; if (sc->vsc_nports == VTCON_MAXPORTS) { errno = EBUSY; return (NULL); } port = &sc->vsc_ports[sc->vsc_nports++]; port->vsp_id = sc->vsc_nports - 1; port->vsp_sc = sc; port->vsp_name = name; port->vsp_cb = cb; port->vsp_arg = arg; if (port->vsp_id == 0) { /* port0 */ port->vsp_txq = 0; port->vsp_rxq = 1; } else { port->vsp_txq = sc->vsc_nports * 2; port->vsp_rxq = port->vsp_txq + 1; } port->vsp_enabled = true; return (port); } static int pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, const char *path) { struct pci_vtcon_sock *sock; struct sockaddr_un sun; char *pathcopy; int s = -1, fd = -1, error = 0; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif sock = calloc(1, sizeof(struct pci_vtcon_sock)); if (sock == NULL) { error = -1; goto out; } s = socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { error = -1; goto out; } pathcopy = strdup(path); if (pathcopy == NULL) { error = -1; goto out; } fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); if (fd < 0) { free(pathcopy); error = -1; goto out; } sun.sun_family = AF_UNIX; sun.sun_len = sizeof(struct sockaddr_un); strcpy(pathcopy, path); strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); free(pathcopy); if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { error = -1; goto out; } if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { error = -1; goto out; } if (listen(s, 1) < 0) { error = -1; goto out; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); if (sock->vss_port == NULL) { error = -1; goto out; } sock->vss_open = false; sock->vss_conn_fd = -1; sock->vss_server_fd = s; sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, sock); if (sock->vss_server_evp == NULL) { error = -1; goto out; } out: if (fd != -1) close(fd); if (error != 0) { if (s != -1) close(s); free(sock); } return (error); } static void pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; int s; s = accept(sock->vss_server_fd, NULL, NULL); if (s < 0) return; if (sock->vss_open) { close(s); return; } sock->vss_open = true; sock->vss_conn_fd = s; sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); pci_vtcon_open_port(sock->vss_port, true); } static void pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_port *port; struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; struct vqueue_info *vq; struct iovec iov; static char dummybuf[2048]; int len, n; uint16_t idx; port = sock->vss_port; vq = pci_vtcon_port_to_vq(port, true); if (!sock->vss_open || !port->vsp_rx_ready) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); if (len == 0) goto close; return; } if (!vq_has_descs(vq)) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); if (len == 0) goto close; return; } do { n = vq_getchain(vq, &idx, &iov, 1, NULL); len = readv(sock->vss_conn_fd, &iov, n); if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { - vq_retchain(vq); + vq_retchains(vq, 1); vq_endchains(vq, 0); if (len == 0) goto close; return; } 
vq_relchain(vq, idx, len); } while (vq_has_descs(vq)); vq_endchains(vq, 1); close: mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } static void pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_sock *sock; int i, ret; sock = (struct pci_vtcon_sock *)arg; if (sock->vss_conn_fd == -1) return; for (i = 0; i < niov; i++) { ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, iov[i].iov_len); if (ret <= 0) break; } if (ret <= 0) { mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } } static void pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *tmp; struct pci_vtcon_control resp, *ctrl; int i; assert(niov == 1); sc = port->vsp_sc; ctrl = (struct pci_vtcon_control *)iov->iov_base; switch (ctrl->event) { case VTCON_DEVICE_READY: sc->vsc_ready = true; /* set port ready events for registered ports */ for (i = 0; i < VTCON_MAXPORTS; i++) { tmp = &sc->vsc_ports[i]; if (tmp->vsp_enabled) pci_vtcon_announce_port(tmp); if (tmp->vsp_open) pci_vtcon_open_port(tmp, true); } break; case VTCON_PORT_READY: if (ctrl->id >= sc->vsc_nports) { WPRINTF(("VTCON_PORT_READY event for unknown port %d\n", ctrl->id)); return; } tmp = &sc->vsc_ports[ctrl->id]; if (tmp->vsp_console) { resp.event = VTCON_CONSOLE_PORT; resp.id = ctrl->id; resp.value = 1; pci_vtcon_control_send(sc, &resp, NULL, 0); } break; } } static void pci_vtcon_announce_port(struct pci_vtcon_port *port) { struct pci_vtcon_control event; event.id = port->vsp_id; event.event = VTCON_DEVICE_ADD; event.value = 1; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); event.event = VTCON_PORT_NAME; pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, strlen(port->vsp_name)); } static void pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) { struct pci_vtcon_control event; if (!port->vsp_sc->vsc_ready) { port->vsp_open = true; return; } event.id = port->vsp_id; event.event = VTCON_PORT_OPEN; event.value = (int)open; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); } static void pci_vtcon_control_send(struct pci_vtcon_softc *sc, struct pci_vtcon_control *ctrl, const void *payload, size_t len) { struct vqueue_info *vq; struct iovec iov; uint16_t idx; int n; vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); if (!vq_has_descs(vq)) return; n = vq_getchain(vq, &idx, &iov, 1, NULL); assert(n == 1); memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); if (payload != NULL && len > 0) memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), payload, len); vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); vq_endchains(vq, 1); } static void pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; struct iovec iov[1]; uint16_t idx, n; uint16_t flags[8]; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); while (vq_has_descs(vq)) { n = vq_getchain(vq, &idx, iov, 1, flags); assert(n >= 1); if (port != NULL) port->vsp_cb(port, port->vsp_arg, iov, 1); /* * Release this chain and handle more */ vq_relchain(vq, idx, 0); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
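/*
 * Editor's illustrative sketch, not part of this commit: the control
 * message built by pci_vtcon_announce_port()/pci_vtcon_control_send()
 * is just the packed struct pci_vtcon_control followed by the optional
 * payload.  The helper name and buffer below are hypothetical, and the
 * sketch assumes the declarations earlier in this file, so it is kept
 * under #if 0.
 */
#if 0
static size_t
example_build_port_name_msg(uint8_t *buf, uint32_t port_id, const char *name)
{
	struct pci_vtcon_control ctrl;

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.id = port_id;
	ctrl.event = VTCON_PORT_NAME;
	ctrl.value = 1;
	memcpy(buf, &ctrl, sizeof(ctrl));
	memcpy(buf + sizeof(ctrl), name, strlen(name));

	/* The same total length is later passed to vq_relchain(). */
	return (sizeof(ctrl) + strlen(name));
}
#endif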
*/ } static void pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); if (!port->vsp_rx_ready) { port->vsp_rx_ready = 1; vq_kick_disable(vq); } } static int pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_vtcon_softc *sc; char *portname = NULL; char *portpath = NULL; char *opt; int i; sc = calloc(1, sizeof(struct pci_vtcon_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; sc->vsc_config->cols = 80; sc->vsc_config->rows = 25; vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; for (i = 0; i < VTCON_MAXQ; i++) { sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; sc->vsc_queues[i].vq_notify = i % 2 == 0 ? pci_vtcon_notify_rx : pci_vtcon_notify_tx; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); /* create control port */ sc->vsc_control_port.vsp_sc = sc; sc->vsc_control_port.vsp_txq = 2; sc->vsc_control_port.vsp_rxq = 3; sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; sc->vsc_control_port.vsp_enabled = true; while ((opt = strsep(&opts, ",")) != NULL) { portname = strsep(&opt, "="); portpath = opt; /* create port */ if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { fprintf(stderr, "cannot create port %s: %s\n", portname, strerror(errno)); return (1); } } return (0); } struct pci_devemu pci_de_vcon = { .pe_emu = "virtio-console", .pe_init = pci_vtcon_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vcon); Index: head/usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- head/usr.sbin/bhyve/pci_virtio_net.c (revision 354551) +++ head/usr.sbin/bhyve/pci_virtio_net.c (revision 354552) @@ -1,528 +1,591 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" +#include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 +#define VTNET_MAX_PKT_LEN (65536 + 64) + #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; unsigned int rx_vhdrlen; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !\n")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* Set sc->resetting and give a chance to the TX thread to stop. */ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. 
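/*
 * Editor's sketch, not part of this commit, of the stop-the-worker
 * handshake used by pci_vtnet_reset() above: both flags are protected
 * by the TX mutex, and the resetter polls until the TX thread has
 * parked itself.  All names below are hypothetical.
 */
#include <pthread.h>
#include <unistd.h>

struct example_txstate {
	pthread_mutex_t	mtx;		/* stands in for sc->tx_mtx */
	int		resetting;	/* stands in for sc->resetting */
	int		in_progress;	/* stands in for sc->tx_in_progress */
};

static void
example_wait_for_tx_idle(struct example_txstate *ts)
{
	pthread_mutex_lock(&ts->mtx);
	ts->resetting = 1;
	while (ts->in_progress) {
		/* Drop the lock so the worker can clear in_progress. */
		pthread_mutex_unlock(&ts->mtx);
		usleep(10000);
		pthread_mutex_lock(&ts->mtx);
	}
	/* The worker is parked: safe to reset rings and features here. */
	ts->resetting = 0;
	pthread_mutex_unlock(&ts->mtx);
}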
*/ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } +struct virtio_mrg_rxbuf_info { + uint16_t idx; + uint16_t pad; + uint32_t len; +}; + static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { + struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; - int len, n; - uint16_t idx; + uint32_t cur_iov_bytes; + struct iovec *cur_iov; + uint16_t cur_iov_len; + uint32_t ulen; + int n_chains; + int len; vq = &sc->vsc_queues[VTNET_RXQ]; for (;;) { /* - * Check for available rx buffers. + * Get a descriptor chain to store the next ingress + * packet. In case of mergeable rx buffers, get as + * many chains as necessary in order to make room + * for a maximum sized LRO packet. */ - if (!vq_has_descs(vq)) { - /* No rx buffers. Enable RX kicks and double check. */ - vq_kick_enable(vq); - if (!vq_has_descs(vq)) { + cur_iov_bytes = 0; + cur_iov_len = 0; + cur_iov = iov; + n_chains = 0; + do { + int n = vq_getchain(vq, &info[n_chains].idx, cur_iov, + VTNET_MAXSEGS - cur_iov_len, NULL); + + if (n == 0) { /* - * Still no buffers. Interrupt if needed - * (including for NOTIFY_ON_EMPTY), and - * disable the backend until the next kick. + * No rx buffers. Enable RX kicks and double + * check. */ - vq_endchains(vq, /*used_all_avail=*/1); - netbe_rx_disable(sc->vsc_be); - return; + vq_kick_enable(vq); + if (!vq_has_descs(vq)) { + /* + * Still no buffers. Return the unused + * chains (if any), interrupt if needed + * (including for NOTIFY_ON_EMPTY), and + * disable the backend until the next + * kick. + */ + vq_retchains(vq, n_chains); + vq_endchains(vq, /*used_all_avail=*/1); + netbe_rx_disable(sc->vsc_be); + return; + } + + /* More rx buffers found, so keep going. */ + vq_kick_disable(vq); + continue; } + assert(n >= 1 && cur_iov_len + n <= VTNET_MAXSEGS); + cur_iov_len += n; + if (!sc->rx_merge) { + n_chains = 1; + break; + } + info[n_chains].len = (uint32_t)count_iov(cur_iov, n); + cur_iov_bytes += info[n_chains].len; + cur_iov += n; + n_chains++; + } while (cur_iov_bytes < VTNET_MAX_PKT_LEN && + cur_iov_len < VTNET_MAXSEGS); - /* More rx buffers found, so keep going. */ - vq_kick_disable(vq); - } + len = netbe_recv(sc->vsc_be, iov, cur_iov_len); - /* - * Get descriptor chain. - */ - n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); - assert(n >= 1 && n <= VTNET_MAXSEGS); - - len = netbe_recv(sc->vsc_be, iov, n); - if (len <= 0) { /* * No more packets (len == 0), or backend errored * (err < 0). Return unused available buffers * and stop. */ - vq_retchain(vq); + vq_retchains(vq, n_chains); /* Interrupt if needed/appropriate and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } - /* Publish the info to the guest */ - vq_relchain(vq, idx, (uint32_t)len); + ulen = (uint32_t)len; /* avoid too many casts below */ + + /* Publish the used buffers to the guest. */ + if (!sc->rx_merge) { + vq_relchain(vq, info[0].idx, ulen); + } else { + struct virtio_net_rxhdr *hdr = iov[0].iov_base; + uint32_t iolen; + int i = 0; + + assert(iov[0].iov_len >= sizeof(*hdr)); + + do { + iolen = info[i].len; + if (iolen > ulen) { + iolen = ulen; + } + vq_relchain_prepare(vq, info[i].idx, iolen); + ulen -= iolen; + i++; + assert(i <= n_chains); + } while (ulen > 0); + + hdr->vrh_bufs = i; + vq_relchain_publish(vq); + vq_retchains(vq, n_chains - i); + } } } /* * Called when there is read activity on the backend file descriptor. 
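/*
 * Editor's sketch, not part of this commit: the publication loop in
 * pci_vtnet_rx() above spreads one packet of ulen bytes over as many
 * guest chains as needed and stores the chain count in the header's
 * vrh_bufs field.  The stand-alone helper below is a hypothetical
 * restatement of that accounting.
 */
#include <stdint.h>

static int
example_chains_used(const uint32_t *chain_caps, int n_chains, uint32_t ulen)
{
	int i = 0;

	/* Consume chains until the whole packet is accounted for. */
	while (ulen > 0 && i < n_chains) {
		uint32_t iolen = chain_caps[i];

		if (iolen > ulen)
			iolen = ulen;
		ulen -= iolen;
		i++;
	}
	return (i);	/* pci_vtnet_rx() writes this into hdr->vrh_bufs */
}
/* E.g. capacities {1514, 1514, 1514} and ulen 3000 use 2 chains. */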
* Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. */ pthread_mutex_lock(&sc->rx_mtx); vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; uint16_t idx; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); len = netbe_send(sc->vsc_be, iov, n); /* chain is processed, release it and set len */ vq_relchain(vq, idx, len > 0 ? len : 0); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? */ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } #endif static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_vtnet_softc *sc; char tname[MAXCOMLEN + 1]; int mac_provided; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. 
*/ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif /* * Attempt to open the backend device and read the MAC address * if specified. */ mac_provided = 0; if (opts != NULL) { char *devname; char *vtopts; int err; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); if (vtopts != NULL) { err = net_parsemac(vtopts, sc->vsc_config.mac); if (err != 0) { free(devname); free(sc); return (err); } mac_provided = 1; } err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback, sc); free(devname); if (err) { free(sc); return (err); } sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be); } if (!mac_provided) { net_genmac(pi, sc->vsc_config.mac); } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is up if we managed to open backend device. */ sc->vsc_config.status = (opts == NULL || sc->vsc_be); vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) { sc->rx_merge = 0; /* Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; } /* Tell the backend to enable some capabilities it has advertised. 
*/ netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen); } static struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vnet); Index: head/usr.sbin/bhyve/virtio.c =================================================================== --- head/usr.sbin/bhyve/virtio.c (revision 354551) +++ head/usr.sbin/bhyve/virtio.c (revision 354552) @@ -1,796 +1,807 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. 
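/*
 * Editor's note, not part of this commit: vi_softc_linkup() asserts
 * that the device softc and its struct virtio_softc share the same
 * address, i.e. every device softc embeds the virtio softc as its
 * first member.  A hypothetical device softc showing the expected
 * layout (only virtio_softc and vqueue_info are real types):
 */
struct example_vdev_softc {
	struct virtio_softc	vs;		/* must come first */
	struct vqueue_info	queues[2];
	pthread_mutex_t		mtx;
};
/* (void *)&sc->vs == (void *)sc, which is what DEV_SOFTC() relies on. */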
*/ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; + vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use CFG0 if MSI-X is disabled? * Existing code did not... */ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct virtio_desc *)base; base += vq->vq_qsize * sizeof(struct virtio_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; + vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, uint16_t *flags) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); iov[i].iov_len = vd->vd_len; if (flags != NULL) flags[i] = vd->vd_flags; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. 
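/*
 * Editor's note, not part of this commit: direct and indirect chains
 * are both walked as arrays of the same 16-byte descriptor, which is
 * why an indirect table holds vd_len / 16 entries.  Illustrative
 * layout (bhyve's real definition is struct virtio_desc):
 */
#include <stdint.h>

struct example_vring_desc {
	uint64_t	addr;	/* guest-physical buffer address */
	uint32_t	len;	/* buffer length in bytes */
	uint16_t	flags;	/* NEXT, WRITE, INDIRECT */
	uint16_t	next;	/* index of the next descriptor in the chain */
};
_Static_assert(sizeof(struct example_vring_desc) == 16, "16-byte descriptor");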
Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the vd_flags and vd_next field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * * If you want to verify the WRITE flag on each descriptor, pass a * non-NULL "flags" pointer to an array of "uint16_t" of the same size * as n_iov and we'll copy each vd_flags field after unwinding any * indirects. * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->va_idx until all of the descriptors * the guest has written are valid (including all their * vd_next fields and vd_flags). * - * Compute (last_avail - va_idx) in integers mod 2**16. This is + * Compute (va_idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ fprintf(stderr, "%s: ndesc (%u) out of range, driver confused?\r\n", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. 
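/*
 * Editor's worked example, not part of this commit, of the mod-2**16
 * arithmetic used just above for ndesc: the unsigned subtraction stays
 * correct across the 16-bit wrap of the avail index.
 */
#include <assert.h>
#include <stdint.h>

static void
example_avail_math(void)
{
	uint16_t last_avail = 0xfffe;	/* where the device stopped */
	uint16_t va_idx = 0x0003;	/* the driver has since wrapped */
	unsigned int ndesc = (uint16_t)((unsigned int)va_idx - last_avail);

	assert(ndesc == 5);		/* five new chains are available */
}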
*/ ctx = vs->vs_pi->pi_vmctx; *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { fprintf(stderr, "%s: descriptor index %u out of range, " "driver confused?\r\n", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { fprintf(stderr, "%s: descriptor has forbidden INDIRECT flag, " "driver confused?\r\n", name); return (-1); } else { n_indir = vdir->vd_len / 16; if ((vdir->vd_len & 0xf) || n_indir == 0) { fprintf(stderr, "%s: invalid indir len 0x%x, " "driver confused?\r\n", name, (u_int)vdir->vd_len); return (-1); } vindir = paddr_guest2host(ctx, vdir->vd_addr, vdir->vd_len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->vd_flags & VRING_DESC_F_INDIRECT) { fprintf(stderr, "%s: indirect desc has INDIR flag," " driver confused?\r\n", name); return (-1); } _vq_record(i, vp, ctx, iov, n_iov, flags); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) break; next = vp->vd_next; if (next >= n_indir) { fprintf(stderr, "%s: invalid next %u > %u, " "driver confused?\r\n", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) return (i); } loopy: fprintf(stderr, "%s: descriptor loop? count > %d - driver confused?\r\n", name, i); return (-1); } /* - * Return the currently-first request chain back to the available queue. + * Return the first n_chain request chains back to the available queue. * - * (This chain is the one you handled when you called vq_getchain() + * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void -vq_retchain(struct vqueue_info *vq) +vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { - vq->vq_last_avail--; + vq->vq_last_avail -= n_chains; } -/* - * Return specified request chain to the guest, setting its I/O length - * to the provided value. - * - * (This chain is the one you handled when you called vq_getchain() - * and used its positive return value.) - */ void -vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { - uint16_t uidx, mask; volatile struct vring_used *vuh; volatile struct virtio_used *vue; + uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update - * - head is the same value we compute in vq_iovecs(). * * (I apologize for the two fields named vu_idx; the * virtio spec calls the one that vue points to, "id"...) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; - uidx = vuh->vu_idx; - vue = &vuh->vu_ring[uidx++ & mask]; + vue = &vuh->vu_ring[vq->vq_next_used++ & mask]; vue->vu_idx = idx; vue->vu_tlen = iolen; +} +void +vq_relchain_publish(struct vqueue_info *vq) +{ /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). 
*/ atomic_thread_fence_rel(); - vuh->vu_idx = uidx; + vq->vq_used->vu_idx = vq->vq_next_used; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + vq_relchain_prepare(vq, idx, iolen); + vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->vu_idx; /* * Use full memory barrier between vu_idx store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * va_flags below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, { VTCFG_R_PFN, 4, 0, "PFN" }, { VTCFG_R_QNUM, 2, 1, "QNUM" }, { VTCFG_R_QSEL, 2, 0, "QSEL" }, { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, { VTCFG_R_STATUS, 1, 0, "STATUS" }, { VTCFG_R_ISR, 1, 0, "ISR" }, { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, { VTCFG_R_QVEC, 2, 0, "QVEC" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. 
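This revision's split of vq_relchain() into vq_relchain_prepare() and vq_relchain_publish() lets a device model queue several used-ring entries and expose them to the guest with a single vu_idx store (and a single release fence). The sketch below shows how a frontend's transmit path might use the pair; it assumes the usual bhyve device-model headers, and process_tx_chain() plus the 64-segment limit are invented for the example.

/*
 * Hypothetical frontend transmit drain: return every available chain
 * to the guest, but update the used index only once for the batch.
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/uio.h>

#include "pci_emul.h"
#include "virtio.h"

#define EXAMPLE_MAXSEGS	64	/* invented segment limit for the sketch */

/* Hypothetical per-device helper: here it only measures the chain. */
static void
process_tx_chain(void *dev_softc, const struct iovec *iov, int niov)
{
	size_t len = 0;
	int i;

	for (i = 0; i < niov; i++)
		len += iov[i].iov_len;
	(void)dev_softc;
	(void)len;		/* a real device would transmit the data */
}

static void
example_tx_drain(void *dev_softc, struct vqueue_info *vq)
{
	struct iovec iov[EXAMPLE_MAXSEGS];
	uint16_t idx;
	int n, batched = 0;

	while (vq_has_descs(vq)) {
		n = vq_getchain(vq, &idx, iov, EXAMPLE_MAXSEGS, NULL);
		if (n <= 0)
			break;
		process_tx_chain(dev_softc, iov, n);

		/* Fill in the used entry, but don't show it to the guest yet. */
		vq_relchain_prepare(vq, idx, 0);
		batched++;
	}

	if (batched > 0) {
		/* One release fence and one vu_idx store for the whole batch. */
		vq_relchain_publish(vq);
		/* We consumed everything that was available when we looked. */
		vq_endchains(vq, 1);
	}
}

The existing vq_relchain() remains as the prepare-then-publish convenience for the one-chain-at-a-time case.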
*/ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ fprintf(stderr, "%s: read from %s: bad size %d\r\n", name, cr->cr_name, size); } else { fprintf(stderr, "%s: read from bad offset/size %jd/%d\r\n", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_HOSTCAP: value = vc->vc_hv_caps; break; case VTCFG_R_GUESTCAP: value = vs->vs_negotiated_caps; break; case VTCFG_R_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VTCFG_R_QNUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VTCFG_R_QSEL: value = vs->vs_curq; break; case VTCFG_R_QNOTIFY: value = 0; /* XXX */ break; case VTCFG_R_STATUS: value = vs->vs_status; break; case VTCFG_R_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VTCFG_R_CFGVEC: value = vs->vs_msix_cfg_idx; break; case VTCFG_R_QVEC: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. 
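Once the offset has been rebased past the standard register block, the remaining dispatch is entirely up to the device's vc_cfgread callback (signature int (*)(void *, int, int, uint32_t *), declared in virtio.h below). A minimal, hypothetical callback could simply copy out of a packed config structure, as sketched here; the example_config layout and softc are invented for illustration, and although vi_pci_read() has already bounds-checked against vc_cfgsize, the copy is clamped again defensively.

#include <stdint.h>
#include <string.h>

/* Hypothetical device-specific registers; vc_cfgsize would be sizeof(this). */
struct example_config {
	uint8_t		mac[6];
	uint16_t	status;
} __attribute__((packed));

struct example_softc {
	/* a struct virtio_softc would normally come first */
	struct example_config	cfg;
};

static int
example_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct example_softc *sc = vsc;

	if (offset < 0 || size < 1 || size > 4 ||
	    (size_t)offset + (size_t)size > sizeof(sc->cfg))
		return (1);	/* non-zero: let the generic code complain */

	*retval = 0;
	memcpy(retval, (uint8_t *)&sc->cfg + offset, size);
	return (0);
}

A vc_cfgwrite callback is the mirror image, copying from the 32-bit value into the device's config structure.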
*/ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) fprintf(stderr, "%s: write to %s: bad size %d\r\n", name, cr->cr_name, size); if (cr->cr_ro) fprintf(stderr, "%s: write to read-only reg %s\r\n", name, cr->cr_name); } else { fprintf(stderr, "%s: write to bad offset/size %jd/%d\r\n", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VTCFG_R_QSEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VTCFG_R_QNOTIFY: if (value >= vc->vc_nvq) { fprintf(stderr, "%s: queue %d notify out of range\r\n", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else fprintf(stderr, "%s: qnotify queue %d: missing vq/vc notify\r\n", name, (int)value); break; case VTCFG_R_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VTCFG_R_CFGVEC: vs->vs_msix_cfg_idx = value; break; case VTCFG_R_QVEC: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: fprintf(stderr, "%s: write config reg %s: curq %d >= max %d\r\n", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } Index: head/usr.sbin/bhyve/virtio.h =================================================================== --- head/usr.sbin/bhyve/virtio.h (revision 354551) +++ head/usr.sbin/bhyve/virtio.h (revision 354552) @@ -1,490 +1,494 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ #include <machine/atomic.h> /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": <N> descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; <N> uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; <N> elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number <N> that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If <N> is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. * * The <N> 16-byte "desc" descriptors consist of a 64-bit guest * physical address <addr>, a 32-bit length <len>, a 16-bit * <flags>, and a 16-bit <next> field (all in guest byte order). * * There are three flags that may be set in <flags>: * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, <len> is the number of bytes that may * be read/written from guest physical address <addr>. If * INDIRECT is set, WRITE is ignored and <len> provides the length * of the indirect descriptors (and <len> must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each <next> value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each <next> must be in the range [0 .. <len>/16).)
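Before the description moves on to the "avail" ring, the flag semantics above can be made concrete with a short standalone sketch of how a guest driver would lay out a two-descriptor direct chain (a device-readable header chained to a device-writable buffer). The structure and flag values mirror virtio_desc and VRING_DESC_F_* defined just below; all names and addresses here are invented for the illustration.

#include <stdint.h>

#define EX_DESC_F_NEXT	(1 << 0)	/* same bit values as VRING_DESC_F_* */
#define EX_DESC_F_WRITE	(1 << 1)

struct ex_desc {		/* local mirror of struct virtio_desc */
	uint64_t addr;
	uint32_t len;
	uint16_t flags;
	uint16_t next;
};

/*
 * Build a two-descriptor chain at slots `head' and `head + 1':
 * descriptor 0 is read by the device, descriptor 1 is written by it.
 * (A real driver would pick free slots and wrap indices mod N.)
 */
static void
ex_build_chain(struct ex_desc *ring, uint16_t head,
    uint64_t hdr_gpa, uint32_t hdr_len, uint64_t buf_gpa, uint32_t buf_len)
{
	ring[head].addr = hdr_gpa;
	ring[head].len = hdr_len;
	ring[head].flags = EX_DESC_F_NEXT;	/* chained: use `next' */
	ring[head].next = head + 1;

	ring[head + 1].addr = buf_gpa;
	ring[head + 1].len = buf_len;
	ring[head + 1].flags = EX_DESC_F_WRITE;	/* device writes; end of chain */
	ring[head + 1].next = 0;		/* ignored without NEXT */
}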
* * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) */ #define VRING_ALIGN 4096 #define VRING_DESC_F_NEXT (1 << 0) #define VRING_DESC_F_WRITE (1 << 1) #define VRING_DESC_F_INDIRECT (1 << 2) struct virtio_desc { /* AKA vring_desc */ uint64_t vd_addr; /* guest physical address */ uint32_t vd_len; /* length of scatter/gather seg */ uint16_t vd_flags; /* VRING_F_DESC_* */ uint16_t vd_next; /* next desc if F_NEXT */ } __packed; struct virtio_used { /* AKA vring_used_elem */ uint32_t vu_idx; /* head of used descriptor chain */ uint32_t vu_tlen; /* length written-to */ } __packed; #define VRING_AVAIL_F_NO_INTERRUPT 1 struct vring_avail { uint16_t va_flags; /* VRING_AVAIL_F_* */ uint16_t va_idx; /* counts to 65535, then cycles */ uint16_t va_ring[]; /* size N, reported in QNUM value */ /* uint16_t va_used_event; -- after N ring entries */ } __packed; #define VRING_USED_F_NO_NOTIFY 1 struct vring_used { uint16_t vu_flags; /* VRING_USED_F_* */ uint16_t vu_idx; /* counts to 65535, then cycles */ struct virtio_used vu_ring[]; /* size N */ /* uint16_t vu_avail_event; -- after N ring entries */ } __packed; /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. 
(The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * Virtio device types * * XXX Should really be merged with defines */ #define VIRTIO_TYPE_NET 1 #define VIRTIO_TYPE_BLOCK 2 #define VIRTIO_TYPE_CONSOLE 3 #define VIRTIO_TYPE_ENTROPY 4 #define VIRTIO_TYPE_BALLOON 5 #define VIRTIO_TYPE_IOMEMORY 6 #define VIRTIO_TYPE_RPMSG 7 #define VIRTIO_TYPE_SCSI 8 #define VIRTIO_TYPE_9P 9 /* experimental IDs start at 65535 and work down */ /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. * * If MSI-X is enabled, the ISR register is generally not used, * and the configuration vector and queue vector appear at offsets * 20 and 22 with the remaining configuration registers at 24. * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. */ #define VTCFG_R_HOSTCAP 0 #define VTCFG_R_GUESTCAP 4 #define VTCFG_R_PFN 8 #define VTCFG_R_QNUM 12 #define VTCFG_R_QSEL 14 #define VTCFG_R_QNOTIFY 16 #define VTCFG_R_STATUS 18 #define VTCFG_R_ISR 19 #define VTCFG_R_CFGVEC 20 #define VTCFG_R_QVEC 22 #define VTCFG_R_CFG0 20 /* No MSI-X */ #define VTCFG_R_CFG1 24 /* With MSI-X */ #define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, * but a guest writing 0 to this register means "please reset". */ #define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ #define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ #define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ #define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ /* * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. * * (We don't [yet?] ever use CONF_CHANGED.) */ #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ #define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. * Note: bits 0 through 23 are reserved to each device type. */ #define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) #define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) #define VIRTIO_RING_F_EVENT_IDX (1 << 29) /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline size_t vring_size(u_int qsz) { size_t size; /* constant 3 below = va_flags, va_idx, va_used_event */ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); size = roundup2(size, VRING_ALIGN); /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; size = roundup2(size, VRING_ALIGN); return (size); } struct vmctx; struct pci_devinst; struct vqueue_info; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. 
The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? */ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? 
*/ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ volatile struct virtio_desc *vq_desc; /* descriptor array */ volatile struct vring_avail *vq_avail; /* the "avail" ring */ volatile struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? */ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->va_idx); } /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { VS_LOCK(vs); vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vu_flags * happens before the load from va_idx, which results from * a subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); -void vq_retchain(struct vqueue_info *vq); +void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); +void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, + uint32_t iolen); +void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); #endif /* _VIRTIO_H_ */
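Finally, the EVENT_IDX machinery referenced by VQ_USED_EVENT_IDX() above deserves a worked example. When VIRTIO_RING_F_EVENT_IDX is negotiated (leaving the NOTIFY_ON_EMPTY special case aside), vq_endchains() raises an interrupt only if the used index has just moved past the guest's used_event value, using the wrap-safe comparison (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx), the same form found in the kernel's virtio_ring.h. The following standalone sketch exercises that expression with made-up index values.

#include <stdint.h>
#include <stdio.h>

/* The wrap-safe test used by vq_endchains(). */
static int
needs_intr(uint16_t new_idx, uint16_t old_idx, uint16_t event_idx)
{
	return ((uint16_t)(new_idx - event_idx - 1) <
	    (uint16_t)(new_idx - old_idx));
}

int
main(void)
{
	/* Guest asked to be told once the used index passes 10. */
	printf("%d\n", needs_intr(12, 8, 10));	/* 1: crossed 10 going 8 -> 12 */
	printf("%d\n", needs_intr(9, 8, 10));	/* 0: not there yet */
	/* Still correct when the indices wrap through 65535. */
	printf("%d\n", needs_intr(3, 65533, 65535));	/* 1: crossed 65535 */
	return (0);
}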