D20659.id59117.diff

Index: usr.sbin/bhyve/Makefile
===================================================================
--- usr.sbin/bhyve/Makefile
+++ usr.sbin/bhyve/Makefile
@@ -32,6 +32,7 @@
mem.c \
mevent.c \
mptbl.c \
+ net_backends.c \
net_utils.c \
pci_ahci.c \
pci_e82545.c \
Index: usr.sbin/bhyve/net_backends.h
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include <stdint.h>
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param);
+int netbe_init(net_backend_t **be, const char *devname, net_be_rxeof_t cb,
+ void *param);
+void netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int netbe_set_cap(net_backend_t *be, uint64_t cap,
+ unsigned vnet_hdr_len);
+ssize_t netbe_send(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t netbe_rx_discard(net_backend_t *be);
+
+
+/*
+ * Network device capabilities taken from the VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontends
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
+ */
+#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+ (1 << 21) /* guest can send gratuitous pkts */
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+#endif /* __NET_BACKENDS_H__ */
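
The header above is the entire frontend-facing API. As a rough usage
sketch (not part of this patch: my_rxeof() and my_attach() are
hypothetical placeholders, and 'enum ev_type' comes from mevent.h), a
frontend attaches to a backend along these lines:

    /* Hypothetical sketch of driving the netbe_*() API. */
    static void
    my_rxeof(int fd, enum ev_type type, void *param)
    {
            /* Drain packets from the backend with netbe_recv(). */
    }

    static int
    my_attach(const char *devname)
    {
            net_backend_t *be;
            uint64_t caps;

            if (netbe_init(&be, devname, my_rxeof, NULL) != 0)
                    return (-1);

            /* Advertise to the guest only what the backend supports. */
            caps = netbe_get_cap(be);

            /*
             * Once features are negotiated, enable them in the backend,
             * together with the virtio-net header length in use
             * (0, 10 or 12 bytes).
             */
            if (netbe_set_cap(be, caps,
                sizeof(struct virtio_net_rxhdr)) != 0) {
                    netbe_cleanup(be);
                    return (-1);
            }

            return (0);
    }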
Index: usr.sbin/bhyve/net_backends.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.c
@@ -0,0 +1,799 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements multiple network backends (tap, netmap, ...),
+ * to be used by network frontends such as virtio-net and e1000.
+ * The API to access the backend (e.g. send/receive packets, negotiate
+ * features) is exported by net_backends.h.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h> /* u_short etc */
+#include <net/if.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sysexits.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <poll.h>
+#include <net/netmap.h>
+#include <net/netmap_virt.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#include <sys/capsicum.h>
+#endif
+
+#include "iov.h"
+#include "mevent.h"
+#include "net_backends.h"
+
+#include <sys/linker_set.h>
+
+/*
+ * Each network backend registers a set of function pointers that are
+ * used to implement the net backends API.
+ * This might need to be exposed if we implement backends in separate files.
+ */
+struct net_backend {
+ const char *prefix; /* prefix matching this backend */
+
+ /*
+ * Routines used to initialize and cleanup the resources needed
+ * by a backend. The cleanup function is used internally,
+ * and should not be called by the frontend.
+ */
+ int (*init)(struct net_backend *be, const char *devname,
+ net_be_rxeof_t cb, void *param);
+ void (*cleanup)(struct net_backend *be);
+
+ /*
+ * Called to serve a guest transmit request. The scatter-gather
+ * vector provided by the caller has 'iovcnt' elements and contains
+ * the packet to send.
+ */
+ ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+ /*
+ * Called to receive a packet from the backend. When the function
+ * returns a positive value 'len', the scatter-gather vector
+ * provided by the caller contains a packet of that length.
+ * The function returns 0 if the backend doesn't have a new packet to
+ * receive.
+ */
+ ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+ /*
+ * Ask the backend for the virtio-net features it is able to
+ * support. Possible features are TSO, UFO and checksum offloading
+ * in both the rx and tx directions, for both IPv4 and IPv6.
+ */
+ uint64_t (*get_cap)(struct net_backend *be);
+
+ /*
+ * Tell the backend to enable/disable the specified virtio-net
+ * features (capabilities).
+ */
+ int (*set_cap)(struct net_backend *be, uint64_t features,
+ unsigned int vnet_hdr_len);
+
+ struct pci_vtnet_softc *sc;
+ int fd;
+
+ /*
+ * Length of the virtio-net header used by the backend and the
+ * frontend, respectively. A zero value means that the header
+ * is not used.
+ */
+ unsigned int be_vnet_hdr_len;
+ unsigned int fe_vnet_hdr_len;
+
+ /* Size of backend-specific private data. */
+ size_t priv_size;
+
+ /* Room for backend-specific data. */
+ char opaque[0];
+};
+
+SET_DECLARE(net_backend_set, struct net_backend);
+
+#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
+
+#define WPRINTF(params) printf params
+
+/*
+ * The tap backend
+ */
+
+struct tap_priv {
+ struct mevent *mevp;
+};
+
+static void
+tap_cleanup(struct net_backend *be)
+{
+ struct tap_priv *priv = (struct tap_priv *)be->opaque;
+
+ if (priv->mevp) {
+ mevent_delete(priv->mevp);
+ }
+ if (be->fd != -1) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static int
+tap_init(struct net_backend *be, const char *devname,
+ net_be_rxeof_t cb, void *param)
+{
+ struct tap_priv *priv = (struct tap_priv *)be->opaque;
+ char tbuf[80];
+ int fd;
+ int opt = 1;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ if (cb == NULL) {
+ WPRINTF(("TAP backend requires non-NULL callback\n"));
+ return (-1);
+ }
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, devname, sizeof(tbuf));
+
+ fd = open(tbuf, O_RDWR);
+ if (fd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ goto error;
+ }
+
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ if (ioctl(fd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ goto error;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ priv->mevp = mevent_add(fd, EVF_READ, cb, param);
+ if (priv->mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ goto error;
+ }
+
+ be->fd = fd;
+
+ return (0);
+
+error:
+ tap_cleanup(be);
+ return (-1);
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static ssize_t
+tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ return writev(be->fd, iov, iovcnt);
+}
+
+static ssize_t
+tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ ssize_t ret;
+
+ /* Should never be called without a valid tap fd */
+ assert(be->fd != -1);
+
+ ret = readv(be->fd, iov, iovcnt);
+
+ if (ret < 0 && errno == EWOULDBLOCK) {
+ return (0);
+ }
+
+ return (ret);
+}
+
+static uint64_t
+tap_get_cap(struct net_backend *be)
+{
+
+ return (0); /* no capabilities for now */
+}
+
+static int
+tap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+
+ return ((features || vnet_hdr_len) ? -1 : 0);
+}
+
+static struct net_backend tap_backend = {
+ .prefix = "tap",
+ .priv_size = sizeof(struct tap_priv),
+ .init = tap_init,
+ .cleanup = tap_cleanup,
+ .send = tap_send,
+ .recv = tap_recv,
+ .get_cap = tap_get_cap,
+ .set_cap = tap_set_cap,
+};
+
+/* A clone of the tap backend, with a different prefix. */
+static struct net_backend vmnet_backend = {
+ .prefix = "vmnet",
+ .priv_size = sizeof(struct tap_priv),
+ .init = tap_init,
+ .cleanup = tap_cleanup,
+ .send = tap_send,
+ .recv = tap_recv,
+ .get_cap = tap_get_cap,
+ .set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+DATA_SET(net_backend_set, vmnet_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+struct netmap_priv {
+ char ifname[IFNAMSIZ];
+ struct nm_desc *nmd;
+ uint16_t memid;
+ struct netmap_ring *rx;
+ struct netmap_ring *tx;
+ struct mevent *mevp;
+ net_be_rxeof_t cb;
+ void *cb_param;
+};
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+
+ memset(req, 0, sizeof(*req));
+ strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
+ req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+ int err;
+ struct nmreq req;
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+ nmreq_init(&req, priv->ifname);
+ req.nr_cmd = NETMAP_BDG_VNET_HDR;
+ req.nr_arg1 = vnet_hdr_len;
+ err = ioctl(be->fd, NIOCREGIF, &req);
+ if (err) {
+ WPRINTF(("Unable to set vnet header length %d\n",
+ vnet_hdr_len));
+ return (err);
+ }
+
+ be->be_vnet_hdr_len = vnet_hdr_len;
+
+ return (0);
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+ int prev_hdr_len = be->be_vnet_hdr_len;
+ int ret;
+
+ if (vnet_hdr_len == prev_hdr_len) {
+ return (1);
+ }
+
+ ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+ if (ret) {
+ return (0);
+ }
+
+ netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+ return (1);
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+
+ return netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
+ NETMAP_FEATURES : 0;
+}
+
+static int
+netmap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+
+ return netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+}
+
+static int
+netmap_init(struct net_backend *be, const char *devname,
+ net_be_rxeof_t cb, void *param)
+{
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+ strlcpy(priv->ifname, devname, sizeof(priv->ifname));
+ priv->ifname[sizeof(priv->ifname) - 1] = '\0';
+
+ priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
+ if (priv->nmd == NULL) {
+ WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
+ devname, strerror(errno)));
+ return (-1);
+ }
+
+ priv->memid = priv->nmd->req.nr_arg2;
+ priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
+ priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
+ priv->cb = cb;
+ priv->cb_param = param;
+ be->fd = priv->nmd->fd;
+
+ priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
+ if (priv->mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ return (-1);
+ }
+
+ return (0);
+}
+
+static void
+netmap_cleanup(struct net_backend *be)
+{
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+ if (priv->mevp) {
+ mevent_delete(priv->mevp);
+ }
+ if (priv->nmd) {
+ nm_close(priv->nmd);
+ }
+ be->fd = -1;
+}
+
+static ssize_t
+netmap_send(struct net_backend *be, struct iovec *iov,
+ int iovcnt)
+{
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+ struct netmap_ring *ring;
+ ssize_t totlen = 0;
+ int nm_buf_size;
+ int nm_buf_len;
+ uint32_t head;
+ void *nm_buf;
+ int j;
+
+ ring = priv->tx;
+ head = ring->head;
+ if (head == ring->tail) {
+ WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt)));
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+
+ for (j = 0; j < iovcnt; j++) {
+ int iov_frag_size = iov[j].iov_len;
+ void *iov_frag_buf = iov[j].iov_base;
+
+ totlen += iov_frag_size;
+
+ /*
+ * Split each iovec fragment over more netmap slots, if
+ * necessary.
+ */
+ for (;;) {
+ int copylen;
+
+ copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
+ memcpy(nm_buf, iov_frag_buf, copylen);
+
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ nm_buf += copylen;
+ nm_buf_size -= copylen;
+ nm_buf_len += copylen;
+
+ if (iov_frag_size == 0) {
+ break;
+ }
+
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = NS_MOREFRAG;
+ head = nm_ring_next(ring, head);
+ if (head == ring->tail) {
+ /*
+ * We ran out of netmap slots while
+ * splitting the iovec fragments.
+ */
+ WPRINTF(("No space, drop %zu bytes\n",
+ count_iov(iov, iovcnt)));
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+ }
+ }
+
+ /* Complete the last slot, which must not have NS_MOREFRAG set. */
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = 0;
+ head = nm_ring_next(ring, head);
+
+ /* Now update ring->head and ring->cur. */
+ ring->head = ring->cur = head;
+txsync:
+ ioctl(be->fd, NIOCTXSYNC, NULL);
+
+ return (totlen);
+}
+
+static ssize_t
+netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+ struct netmap_slot *slot = NULL;
+ struct netmap_ring *ring;
+ void *iov_frag_buf;
+ int iov_frag_size;
+ ssize_t totlen = 0;
+ uint32_t head;
+
+ assert(iovcnt);
+
+ ring = priv->rx;
+ head = ring->head;
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+
+ do {
+ int nm_buf_len;
+ void *nm_buf;
+
+ if (head == ring->tail) {
+ return (0);
+ }
+
+ slot = ring->slot + head;
+ nm_buf = NETMAP_BUF(ring, slot->buf_idx);
+ nm_buf_len = slot->len;
+
+ for (;;) {
+ int copylen = nm_buf_len < iov_frag_size ? nm_buf_len : iov_frag_size;
+
+ memcpy(iov_frag_buf, nm_buf, copylen);
+ nm_buf += copylen;
+ nm_buf_len -= copylen;
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ totlen += copylen;
+
+ if (nm_buf_len == 0) {
+ break;
+ }
+
+ iov++;
+ iovcnt--;
+ if (iovcnt == 0) {
+ /* No space to receive. */
+ WPRINTF(("Short iov, drop %zd bytes\n",
+ totlen));
+ return (-ENOSPC);
+ }
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+ }
+
+ head = nm_ring_next(ring, head);
+
+ } while (slot->flags & NS_MOREFRAG);
+
+ /* Release slots to netmap. */
+ ring->head = ring->cur = head;
+
+ return (totlen);
+}
+
+static struct net_backend netmap_backend = {
+ .prefix = "netmap",
+ .priv_size = sizeof(struct netmap_priv),
+ .init = netmap_init,
+ .cleanup = netmap_cleanup,
+ .send = netmap_send,
+ .recv = netmap_recv,
+ .get_cap = netmap_get_cap,
+ .set_cap = netmap_set_cap,
+};
+
+/* A clone of the netmap backend, with a different prefix. */
+static struct net_backend vale_backend = {
+ .prefix = "vale",
+ .priv_size = sizeof(struct netmap_priv),
+ .init = netmap_init,
+ .cleanup = netmap_cleanup,
+ .send = netmap_send,
+ .recv = netmap_recv,
+ .get_cap = netmap_get_cap,
+ .set_cap = netmap_set_cap,
+};
+
+DATA_SET(net_backend_set, netmap_backend);
+DATA_SET(net_backend_set, vale_backend);
+
+/*
+ * Initialize a backend and attach to the frontend.
+ * This is called during frontend initialization.
+ * @ret is a pointer to the backend to be initialized
+ * @devname is the backend-name as supplied on the command line,
+ * e.g. -s 2:0,frontend-name,backend-name[,other-args]
+ * @cb is the receive callback supplied by the frontend,
+ * and it is invoked in the event loop when a receive
+ * event is generated in the hypervisor.
+ * @param is a pointer to the frontend, and normally used as
+ * the argument for the callback.
+ */
+int
+netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
+ void *param)
+{
+ struct net_backend **pbe, *nbe, *tbe = NULL;
+ int err;
+
+ /*
+ * Find the network backend that matches the user-provided
+ * device name. net_backend_set is built using a linker set.
+ */
+ SET_FOREACH(pbe, net_backend_set) {
+ if (!strncmp(devname, (*pbe)->prefix, strlen((*pbe)->prefix))) {
+ tbe = *pbe;
+ assert(tbe->init != NULL);
+ assert(tbe->cleanup != NULL);
+ assert(tbe->send != NULL);
+ assert(tbe->recv != NULL);
+ assert(tbe->get_cap != NULL);
+ assert(tbe->set_cap != NULL);
+ break;
+ }
+ }
+
+ *ret = NULL;
+ if (tbe == NULL)
+ return (EINVAL);
+ nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
+ *nbe = *tbe; /* copy the template */
+ nbe->fd = -1;
+ nbe->sc = param;
+ nbe->be_vnet_hdr_len = 0;
+ nbe->fe_vnet_hdr_len = 0;
+
+ /* Initialize the backend. */
+ err = nbe->init(nbe, devname, cb, param);
+ if (err) {
+ free(nbe);
+ return (err);
+ }
+
+ *ret = nbe;
+
+ return (0);
+}
+
+void
+netbe_cleanup(struct net_backend *be)
+{
+
+ if (be != NULL) {
+ be->cleanup(be);
+ free(be);
+ }
+}
+
+uint64_t
+netbe_get_cap(struct net_backend *be)
+{
+
+ assert(be != NULL);
+ return be->get_cap(be);
+}
+
+int
+netbe_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ int ret;
+
+ assert(be != NULL);
+
+ /* There are only three valid lengths, i.e., 0, 10 and 12. */
+ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
+ && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
+ return (-1);
+
+ be->fe_vnet_hdr_len = vnet_hdr_len;
+
+ ret = be->set_cap(be, features, vnet_hdr_len);
+ assert(be->be_vnet_hdr_len == 0 ||
+ be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
+
+ return (ret);
+}
+
+static __inline struct iovec *
+iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+ struct iovec *riov;
+
+ /* XXX short-cut: assume first segment is >= tlen */
+ assert(iov[0].iov_len >= tlen);
+
+ iov[0].iov_len -= tlen;
+ if (iov[0].iov_len == 0) {
+ assert(*iovcnt > 1);
+ *iovcnt -= 1;
+ riov = &iov[1];
+ } else {
+ iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+ riov = &iov[0];
+ }
+
+ return (riov);
+}
+
+ssize_t
+netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+
+ assert(be != NULL);
+ if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+ /*
+ * The frontend uses a virtio-net header, but the backend
+ * does not. We ignore it (as it must be all zeroes) and
+ * strip it.
+ */
+ assert(be->be_vnet_hdr_len == 0);
+ iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
+ }
+
+ return be->send(be, iov, iovcnt);
+}
+
+/*
+ * Try to read a packet from the backend, without blocking.
+ * If no packets are available, return 0. In case of success, return
+ * the length of the packet just read. Return -1 in case of errors.
+ */
+ssize_t
+netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ /* Length of prepended virtio-net header. */
+ unsigned int hlen = be->fe_vnet_hdr_len;
+ int ret;
+
+ assert(be != NULL);
+
+ if (hlen && hlen != be->be_vnet_hdr_len) {
+ /*
+ * The frontend uses a virtio-net header, but the backend
+ * does not. We need to prepend a zeroed header.
+ */
+ struct virtio_net_rxhdr *vh;
+
+ assert(be->be_vnet_hdr_len == 0);
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vh = iov[0].iov_base;
+ iov = iov_trim(iov, &iovcnt, hlen);
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers if merged rx bufs were negotiated.
+ */
+ memset(vh, 0, hlen);
+ if (hlen == VNET_HDR_LEN) {
+ vh->vrh_bufs = 1;
+ }
+ }
+
+ ret = be->recv(be, iov, iovcnt);
+ if (ret > 0) {
+ ret += hlen;
+ }
+
+ return (ret);
+}
+
+/*
+ * Read a packet from the backend and discard it.
+ * Returns the size of the discarded packet or zero if no packet was available.
+ * A negative error code is returned in case of read error.
+ */
+ssize_t
+netbe_rx_discard(struct net_backend *be)
+{
+ /*
+ * MP note: the dummybuf is only used to discard frames,
+ * so there is no need for it to be per-vtnet or locked.
+ * We only make it large enough for a TSO-sized segment.
+ */
+ static uint8_t dummybuf[65536 + 64];
+ struct iovec iov;
+
+ iov.iov_base = dummybuf;
+ iov.iov_len = sizeof(dummybuf);
+
+ return netbe_recv(be, &iov, 1);
+}
+
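
Backends are discovered through the net_backend_set linker set: each
backend registers a template struct net_backend, and netbe_init()
matches the device name against each template's prefix, then copies
the matching template. Purely as an illustration of the registration
pattern (not part of this patch), a hypothetical "null" backend that
drops all traffic could be added to this file like so:

    /* Hypothetical "null" backend: discards tx, never receives. */
    static int
    null_init(struct net_backend *be, const char *devname,
            net_be_rxeof_t cb, void *param)
    {
            be->fd = -1;    /* nothing to poll */
            return (0);
    }

    static void
    null_cleanup(struct net_backend *be)
    {
    }

    static ssize_t
    null_send(struct net_backend *be, struct iovec *iov, int iovcnt)
    {
            return (count_iov(iov, iovcnt));  /* pretend it was sent */
    }

    static ssize_t
    null_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
    {
            return (0);     /* never a packet to receive */
    }

    static uint64_t
    null_get_cap(struct net_backend *be)
    {
            return (0);
    }

    static int
    null_set_cap(struct net_backend *be, uint64_t features,
            unsigned vnet_hdr_len)
    {
            return ((features || vnet_hdr_len) ? -1 : 0);
    }

    static struct net_backend null_backend = {
            .prefix = "null",
            .priv_size = 0,
            .init = null_init,
            .cleanup = null_cleanup,
            .send = null_send,
            .recv = null_recv,
            .get_cap = null_get_cap,
            .set_cap = null_set_cap,
    };

    DATA_SET(net_backend_set, null_backend);

The "null" name, like the rest of the snippet, is hypothetical; the
registrations actually added by this patch are tap, vmnet, netmap
and vale.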
Index: usr.sbin/bhyve/pci_e82545.c
===================================================================
--- usr.sbin/bhyve/pci_e82545.c
+++ usr.sbin/bhyve/pci_e82545.c
@@ -66,6 +66,7 @@
#include "pci_emul.h"
#include "mevent.h"
#include "net_utils.h"
+#include "net_backends.h"
/* Hardware/register definitions XXX: move some to common code. */
#define E82545_VENDOR_ID_INTEL 0x8086
@@ -245,11 +246,10 @@
struct e82545_softc {
struct pci_devinst *esc_pi;
struct vmctx *esc_ctx;
- struct mevent *esc_mevp;
struct mevent *esc_mevpitr;
pthread_mutex_t esc_mtx;
struct ether_addr esc_mac;
- int esc_tapfd;
+ net_backend_t *esc_be;
/* General */
uint32_t esc_CTRL; /* x0000 device ctl */
@@ -355,7 +355,7 @@
static void e82545_reset(struct e82545_softc *sc, int dev);
static void e82545_rx_enable(struct e82545_softc *sc);
static void e82545_rx_disable(struct e82545_softc *sc);
-static void e82545_tap_callback(int fd, enum ev_type type, void *param);
+static void e82545_rx_callback(int fd, enum ev_type type, void *param);
static void e82545_tx_start(struct e82545_softc *sc);
static void e82545_tx_enable(struct e82545_softc *sc);
static void e82545_tx_disable(struct e82545_softc *sc);
@@ -824,11 +824,9 @@
return (256); /* Forbidden value. */
}
-static uint8_t dummybuf[2048];
-
/* XXX one packet at a time until this is debugged */
static void
-e82545_tap_callback(int fd, enum ev_type type, void *param)
+e82545_rx_callback(int fd, enum ev_type type, void *param)
{
struct e82545_softc *sc = param;
struct e1000_rx_desc *rxd;
@@ -843,7 +841,7 @@
if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n",
sc->esc_rx_enabled, sc->esc_rx_loopback);
- while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
@@ -856,7 +854,7 @@
if (left < maxpktdesc) {
DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n",
left, maxpktdesc);
- while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
@@ -873,9 +871,9 @@
rxd->buffer_addr, bufsz);
vec[i].iov_len = bufsz;
}
- len = readv(sc->esc_tapfd, vec, maxpktdesc);
+ len = netbe_recv(sc->esc_be, vec, maxpktdesc);
if (len <= 0) {
- DPRINTF("tap: readv() returned %d\n", len);
+ DPRINTF("netbe_recv() returned %d\n", len);
goto done;
}
@@ -1050,10 +1048,10 @@
e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt)
{
- if (sc->esc_tapfd == -1)
+ if (sc->esc_be == NULL)
return;
- (void) writev(sc->esc_tapfd, iov, iovcnt);
+ (void) netbe_send(sc->esc_be, iov, iovcnt);
}
static void
@@ -2209,56 +2207,6 @@
sc->esc_TXDCTL = 0;
}
-static void
-e82545_open_tap(struct e82545_softc *sc, char *opts)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-
- if (opts == NULL) {
- sc->esc_tapfd = -1;
- return;
- }
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, opts, sizeof(tbuf));
-
- sc->esc_tapfd = open(tbuf, O_RDWR);
- if (sc->esc_tapfd == -1) {
- DPRINTF("unable to open tap device %s\n", opts);
- exit(4);
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF("tap device O_NONBLOCK failed: %d\n", errno);
- close(sc->esc_tapfd);
- sc->esc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->esc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
- sc->esc_mevp = mevent_add(sc->esc_tapfd,
- EVF_READ,
- e82545_tap_callback,
- sc);
- if (sc->esc_mevp == NULL) {
- DPRINTF("Could not register mevent %d\n", EVF_READ);
- close(sc->esc_tapfd);
- sc->esc_tapfd = -1;
- }
-}
-
static int
e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
@@ -2307,11 +2255,11 @@
E82545_BAR_IO_LEN);
/*
- * Attempt to open the tap device and read the MAC address
+ * Attempt to open the net backend and read the MAC address
* if specified. Copied from virtio-net, slightly modified.
*/
mac_provided = 0;
- sc->esc_tapfd = -1;
+ sc->esc_be = NULL;
if (opts != NULL) {
int err;
@@ -2327,11 +2275,10 @@
mac_provided = 1;
}
- if (strncmp(devname, "tap", 3) == 0 ||
- strncmp(devname, "vmnet", 5) == 0)
- e82545_open_tap(sc, devname);
-
+ err = netbe_init(&sc->esc_be, devname, e82545_rx_callback, sc);
free(devname);
+ if (err)
+ return (err);
}
if (!mac_provided) {
Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -32,22 +32,13 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#ifndef WITHOUT_CAPSICUM
-#include <sys/capsicum.h>
-#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
-#ifndef NETMAP_WITH_LIBS
-#define NETMAP_WITH_LIBS
-#endif
-#include <net/netmap_user.h>
+#include <net/if.h> /* IFNAMSIZ */
-#ifndef WITHOUT_CAPSICUM
-#include <capsicum_helpers.h>
-#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -58,44 +49,20 @@
#include <strings.h>
#include <unistd.h>
#include <assert.h>
-#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
-#include <sysexits.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#include "net_utils.h"
+#include "net_backends.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
-/*
- * Host capabilities. Note that we only offer a few of these.
- */
-#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
-#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
-#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
-#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
-#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
-#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
-#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
-#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
-#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
-#define VIRTIO_NET_F_GUEST_ANNOUNCE \
- (1 << 21) /* guest can send gratuitous pkts */
-
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
@@ -117,19 +84,6 @@
#define VTNET_MAXQ 3
-/*
- * Fixed network header size
- */
-struct virtio_net_rxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-} __packed;
-
/*
* Debug printf
*/
@@ -144,20 +98,16 @@
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
- struct mevent *vsc_mevp;
- int vsc_tapfd;
- struct nm_desc *vsc_nmd;
+ net_backend_t *vsc_be;
int vsc_rx_ready;
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
- struct virtio_net_config vsc_config;
-
pthread_mutex_t rx_mtx;
- int rx_vhdrlen;
+ unsigned int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
@@ -165,9 +115,8 @@
pthread_cond_t tx_cond;
int tx_in_progress;
- void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
- void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
- int iovcnt, int len);
+ struct virtio_net_config vsc_config;
+ struct virtio_consts vsc_consts;
};
static void pci_vtnet_reset(void *);
@@ -223,84 +172,20 @@
pthread_mutex_unlock(&sc->rx_mtx);
}
-/*
- * Called to send a buffer chain out to the tap device
- */
-static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_tapfd == -1)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) writev(sc->vsc_tapfd, iov, iovcnt);
-}
-
-/*
- * Called when there is read activity on the tap file descriptor.
- * Each buffer posted by the guest is assumed to be able to contain
- * an entire ethernet frame + rx header.
- * MP note: the dummybuf is only used for discarding frames, so there
- * is no need for it to be per-vtnet or locked.
- */
-static uint8_t dummybuf[2048];
-
-static __inline struct iovec *
-rx_iov_trim(struct iovec *iov, int *niov, int tlen)
-{
- struct iovec *riov;
-
- /* XXX short-cut: assume first segment is >= tlen */
- assert(iov[0].iov_len >= tlen);
-
- iov[0].iov_len -= tlen;
- if (iov[0].iov_len == 0) {
- assert(*niov > 1);
- *niov -= 1;
- riov = &iov[1];
- } else {
- iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
- riov = &iov[0];
- }
-
- return (riov);
-}
-
static void
-pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
- struct iovec iov[VTNET_MAXSEGS], *riov;
+ struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
- void *vrx;
int len, n;
uint16_t idx;
- /*
- * Should never be called without a valid tap fd
- */
- assert(sc->vsc_tapfd != -1);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
if (!sc->vsc_rx_ready) {
/*
+ * The rx ring has not yet been set up.
* Drop the packet and try later.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ netbe_rx_discard(sc->vsc_be);
return;
}
@@ -310,11 +195,11 @@
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
+ * No available rx buffers. Drop the packet and try later.
+ * Interrupt on empty, if that's negotiated.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
- vq_endchains(vq, 1);
+ netbe_rx_discard(sc->vsc_be);
+ vq_endchains(vq, /*used_all_avail=*/1);
return;
}
@@ -325,256 +210,46 @@
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+ len = netbe_recv(sc->vsc_be, iov, n);
- len = readv(sc->vsc_tapfd, riov, n);
-
- if (len < 0 && errno == EWOULDBLOCK) {
+ if (len <= 0) {
/*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
+ * No more packets (len == 0), or backend errored
+ * (err < 0). Return unused available buffers.
*/
vq_retchain(vq);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
+ if (len == 0) {
+ /* Interrupt if needed/appropriate and stop. */
+ vq_endchains(vq, /*used_all_avail=*/0);
+ return;
+ }
}
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+ /* Publish the info to the guest */
+ vq_relchain(vq, idx, (uint32_t)len);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
-
-static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int r, i;
- int len = 0;
-
- for (r = nmd->cur_tx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_tx_ring)
- r = nmd->first_tx_ring;
- if (r == nmd->cur_tx_ring)
- break;
- continue;
- }
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
-
- for (i = 0; i < iovcnt; i++) {
- if (len + iov[i].iov_len > 2048)
- break;
- memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
- len += iov[i].iov_len;
- }
- ring->slot[cur].len = len;
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_tx_ring = r;
- ioctl(nmd->fd, NIOCTXSYNC, NULL);
- break;
- }
-
- return (len);
-}
-
-static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int len = 0;
- int i = 0;
- int r;
-
- for (r = nmd->cur_rx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
- size_t left;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_rx_ring)
- r = nmd->first_rx_ring;
- if (r == nmd->cur_rx_ring)
- break;
- continue;
- }
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
- left = ring->slot[cur].len;
-
- for (i = 0; i < iovcnt && left > 0; i++) {
- if (iov[i].iov_len > left)
- iov[i].iov_len = left;
- memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
- len += iov[i].iov_len;
- left -= iov[i].iov_len;
- }
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_rx_ring = r;
- ioctl(nmd->fd, NIOCRXSYNC, NULL);
- break;
- }
- for (; i < iovcnt; i++)
- iov[i].iov_len = 0;
-
- return (len);
+ vq_endchains(vq, /*used_all_avail=*/1);
}
/*
- * Called to send a buffer chain out to the vale port
+ * Called when there is read activity on the backend file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
*/
-static void
-pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_nmd == NULL)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
-}
-
-static void
-pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
- struct vqueue_info *vq;
- void *vrx;
- int len, n;
- uint16_t idx;
-
- /*
- * Should never be called without a valid netmap descriptor
- */
- assert(sc->vsc_nmd != NULL);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
- if (!sc->vsc_rx_ready) {
- /*
- * Drop the packet and try later.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- return;
- }
-
- /*
- * Check for available rx buffers
- */
- vq = &sc->vsc_queues[VTNET_RXQ];
- if (!vq_has_descs(vq)) {
- /*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- vq_endchains(vq, 1);
- return;
- }
-
- do {
- /*
- * Get descriptor chain.
- */
- n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
- assert(n >= 1 && n <= VTNET_MAXSEGS);
-
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
-
- if (len == 0) {
- /*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
- */
- vq_retchain(vq);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
- } while (vq_has_descs(vq));
-
- /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
-
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
- sc->pci_vtnet_rx(sc);
+ pci_vtnet_rx(sc);
pthread_mutex_unlock(&sc->rx_mtx);
}
+/* Called on RX kick. */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
@@ -589,35 +264,29 @@
}
}
+/* TX virtqueue processing, called by the TX thread. */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
- int i, n;
- int plen, tlen;
uint16_t idx;
+ ssize_t len;
+ int n;
/*
- * Obtain chain of descriptors. The first one is
- * really the header descriptor, so we need to sum
- * up two lengths: packet length and transfer length.
+ * Obtain chain of descriptors. The first descriptor also
+ * contains the virtio-net header.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- plen = 0;
- tlen = iov[0].iov_len;
- for (i = 1; i < n; i++) {
- plen += iov[i].iov_len;
- tlen += iov[i].iov_len;
- }
- DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
- sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
+ len = netbe_send(sc->vsc_be, iov, n);
- /* chain is processed, release it and set tlen */
- vq_relchain(vq, idx, tlen);
+ /* chain is processed, release it and set len */
+ vq_relchain(vq, idx, len > 0 ? len : 0);
}
+/* Called on TX kick. */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
@@ -684,7 +353,7 @@
/*
* Generate an interrupt if needed.
*/
- vq_endchains(vq, 1);
+ vq_endchains(vq, /*used_all_avail=*/1);
pthread_mutex_lock(&sc->tx_mtx);
}
@@ -699,93 +368,23 @@
}
#endif
-static void
-pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, devname, sizeof(tbuf));
-
- sc->pci_vtnet_rx = pci_vtnet_tap_rx;
- sc->pci_vtnet_tx = pci_vtnet_tap_tx;
-
- sc->vsc_tapfd = open(tbuf, O_RDWR);
- if (sc->vsc_tapfd == -1) {
- WPRINTF(("open of tap device %s failed\n", tbuf));
- return;
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF(("tap device O_NONBLOCK failed\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
- sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-}
-
-static void
-pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
-{
- sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
- sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
-
- sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
- if (sc->vsc_nmd == NULL) {
- WPRINTF(("open of netmap device %s failed\n", ifname));
- return;
- }
-
- sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- nm_close(sc->vsc_nmd);
- sc->vsc_nmd = NULL;
- }
-}
-
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
- char tname[MAXCOMLEN + 1];
struct pci_vtnet_softc *sc;
- char *devname;
- char *vtopts;
+ char tname[MAXCOMLEN + 1];
int mac_provided;
+ /*
+ * Allocate data structures for further virtio initializations.
+ * sc also contains a copy of vtnet_vi_consts, since capabilities
+ * change depending on the backend.
+ */
sc = calloc(1, sizeof(struct pci_vtnet_softc));
+ sc->vsc_consts = vtnet_vi_consts;
pthread_mutex_init(&sc->vsc_mtx, NULL);
- vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
- sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
-
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
@@ -796,13 +395,13 @@
#endif
/*
- * Attempt to open the tap device and read the MAC address
- * if specified
+ * Attempt to open the backend device and read the MAC address
+ * if specified.
*/
mac_provided = 0;
- sc->vsc_tapfd = -1;
- sc->vsc_nmd = NULL;
if (opts != NULL) {
+ char *devname;
+ char *vtopts;
int err;
devname = vtopts = strdup(opts);
@@ -817,13 +416,12 @@
mac_provided = 1;
}
- if (strncmp(devname, "vale", 4) == 0)
- pci_vtnet_netmap_setup(sc, devname);
- if (strncmp(devname, "tap", 3) == 0 ||
- strncmp(devname, "vmnet", 5) == 0)
- pci_vtnet_tap_setup(sc, devname);
-
+ err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback,
+ sc);
free(devname);
+ if (err)
+ return (err);
+ sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {
@@ -837,10 +435,12 @@
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
- /* Link is up if we managed to open tap device or vale port. */
- sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
- sc->vsc_nmd != NULL);
+ /* Link is up if we managed to open backend device. */
+ sc->vsc_config.status = (opts == NULL || sc->vsc_be);
+ vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues);
+ sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
@@ -876,8 +476,8 @@
struct pci_vtnet_softc *sc = vsc;
void *ptr;
- if (offset < 6) {
- assert(offset + size <= 6);
+ if (offset < (int)sizeof(sc->vsc_config.mac)) {
+ assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
/*
* The driver is allowed to change the MAC address
*/
@@ -909,14 +509,18 @@
sc->vsc_features = negotiated_features;
- if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+ if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
- /* non-merge rx header is 2 bytes shorter */
- sc->rx_vhdrlen -= 2;
+ /* Without mergeable rx buffers, virtio-net header is 2
+ * bytes shorter than sizeof(struct virtio_net_rxhdr). */
+ sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
}
+
+ /* Tell the backend to enable some capabilities it has advertised. */
+ netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen);
}
-struct pci_devemu pci_de_vnet = {
+static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,

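A subtlety worth spelling out: netbe_send() and netbe_recv() adapt
between a frontend that uses a virtio-net header and a backend that
does not. On transmit the (all-zero) header is stripped; on receive a
zeroed header is prepended, with vrh_bufs set to 1 when mergeable rx
buffers were negotiated. Both directions rely on iov_trim() to skip
the header in the first scatter-gather segment. A minimal sketch of
its behavior (iov_trim() is static to net_backends.c, so this is
illustration only):

    static void
    iov_trim_example(void)
    {
            struct virtio_net_rxhdr hdr;    /* 12-byte virtio-net header */
            char payload[1500];
            struct iovec iov[2], *riov;
            int iovcnt = 2;

            iov[0].iov_base = &hdr;
            iov[0].iov_len = sizeof(hdr);
            iov[1].iov_base = payload;
            iov[1].iov_len = sizeof(payload);

            /*
             * The first segment is exactly the header, so it is consumed
             * whole: riov ends up pointing at iov[1] and iovcnt drops to 1.
             */
            riov = iov_trim(iov, &iovcnt, sizeof(hdr));
            (void)riov;
    }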