D20659.id58685.diff

Index: usr.sbin/bhyve/Makefile
===================================================================
--- usr.sbin/bhyve/Makefile
+++ usr.sbin/bhyve/Makefile
@@ -32,6 +32,7 @@
mem.c \
mevent.c \
mptbl.c \
+ net_backends.c \
net_utils.c \
pci_ahci.c \
pci_e82545.c \
Index: usr.sbin/bhyve/net_backends.h
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.h
@@ -0,0 +1,110 @@
+/*-
+ * Copyright (c) 2014 Vincenzo Maffione <v.maffione@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include <stdint.h>
+#include <net/if.h>
+
+#include <net/netmap.h>
+#include <net/netmap_virt.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
+#include "mevent.h"
+
+extern int netmap_ioctl_counter;
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_backend_cb_t)(int, enum ev_type, void *param);
+net_backend_t *netbe_init(const char *devname,
+ net_backend_cb_t cb, void *param);
+void netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int netbe_set_cap(net_backend_t *be, uint64_t cap,
+ unsigned vnet_hdr_len);
+void netbe_send(net_backend_t *be, struct iovec *iov,
+ int iovcnt, uint32_t len, int more);
+int netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+int netbe_rx_discard(net_backend_t *be);
+
+
+/*
+ * Network device capabilities taken from VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontends
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
+ */
+#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+ (1 << 21) /* guest can send gratuitous pkts */
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/* Used to get read-only info. */
+struct netmap_if_info {
+ uint32_t nifp_offset;
+ uint16_t num_tx_rings;
+ uint16_t num_rx_rings;
+ uint16_t num_tx_slots;
+ uint16_t num_rx_slots;
+};
+
+#include "pci_emul.h"
+int net_parsemac(char *mac_str, uint8_t *mac_addr);
+void net_genmac(struct pci_devinst *pi, uint8_t *macaddr);
+
+#endif /* __NET_BACKENDS_H__ */
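
For context, here is a minimal sketch of how a frontend might drive the netbe_* API declared above. It is not part of the patch: my_fe, my_rx_cb and my_fe_attach are hypothetical names, the single fixed-size buffer stands in for the guest descriptor chains, and the feature negotiation is simplified.

    #include <sys/uio.h>
    #include "net_backends.h"

    /* Hypothetical frontend state, for illustration only. */
    struct my_fe {
            net_backend_t *be;
            uint8_t rxbuf[2048];
    };

    /* Invoked by the event loop when the backend has data to deliver. */
    static void
    my_rx_cb(int fd, enum ev_type type, void *param)
    {
            struct my_fe *fe = param;
            struct iovec iov;

            (void)fd; (void)type;
            for (;;) {
                    iov.iov_base = fe->rxbuf;
                    iov.iov_len = sizeof(fe->rxbuf);
                    if (netbe_recv(fe->be, &iov, 1) <= 0)
                            break;
                    /* Hand the packet to the guest (omitted). */
            }
    }

    static int
    my_fe_attach(struct my_fe *fe, const char *devname)
    {
            uint64_t caps;

            fe->be = netbe_init(devname, my_rx_cb, fe);
            if (fe->be == NULL)
                    return (-1);
            /* Offer only what the backend supports, then commit the choice. */
            caps = netbe_get_cap(fe->be);
            return (netbe_set_cap(fe->be, caps,
                caps ? sizeof(struct virtio_net_rxhdr) : 0));
    }
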
Index: usr.sbin/bhyve/net_backends.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.c
@@ -0,0 +1,897 @@
+/*-
+ * Copyright (c) 2014-2016 Vincenzo Maffione
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements multiple network backends (null, tap, netmap, ...),
+ * to be used by network frontends such as virtio-net and ptnet.
+ * The API to access the backend (e.g. send/receive packets, negotiate
+ * features) is exported by net_backends.h.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h> /* u_short etc */
+#include <net/if.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sysexits.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <poll.h>
+#include <assert.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#include <sys/capsicum.h>
+#endif
+
+#include "mevent.h"
+#include "net_backends.h"
+
+#include <sys/linker_set.h>
+
+/*
+ * Each network backend registers a set of function pointers that are
+ * used to implement the net backends API.
+ * This might need to be exposed if we implement backends in separate files.
+ */
+struct net_backend {
+ const char *name; /* name of the backend */
+ /*
+ * The init and cleanup functions are used internally;
+ * frontends such as virtio-net should never call them directly.
+ */
+ int (*init)(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param);
+ void (*cleanup)(struct net_backend *be);
+
+
+ /*
+ * Called to serve a guest transmit request. The scatter-gather
+ * vector provided by the caller has 'iovcnt' elements and contains
+ * the packet to send. 'len' is the length of the whole packet in bytes.
+ */
+ int (*send)(struct net_backend *be, struct iovec *iov,
+ int iovcnt, uint32_t len, int more);
+
+ /*
+ * Called to serve guest receive request. When the function
+ * returns a positive value, the scatter-gather vector
+ * provided by the caller (having 'iovcnt' elements in it) will
+ * contain a chunk of the received packet. The function returns
+ * the chunk size in bytes, or 0 if the backend doesn't have a
+ * new packet to receive.
+ * Note that it may be necessary to call this callback many
+ * times to receive a single packet, depending on the size of
+ * the buffers you provide.
+ */
+ int (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+ /*
+ * Ask the backend for the virtio-net features it is able to
+ * support. Possible features are TSO, UFO and checksum offloading
+ * in both the rx and tx directions, for both IPv4 and IPv6.
+ */
+ uint64_t (*get_cap)(struct net_backend *be);
+
+ /*
+ * Tell the backend to enable/disable the specified virtio-net
+ * features (capabilities).
+ */
+ int (*set_cap)(struct net_backend *be, uint64_t features,
+ unsigned int vnet_hdr_len);
+
+ struct pci_vtnet_softc *sc;
+ int fd;
+ unsigned int be_vnet_hdr_len;
+ unsigned int fe_vnet_hdr_len;
+ /* TODO: implement priv space as a 'char opaque[0]' */
+ void *priv; /* Pointer to backend-specific data. */
+};
+
+SET_DECLARE(net_backend_set, struct net_backend);
+
+#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
+
+#define WPRINTF(params) printf params
+
+/* the tap backend */
+
+struct tap_priv {
+ struct mevent *mevp;
+};
+
+static void
+tap_cleanup(struct net_backend *be)
+{
+ struct tap_priv *priv = be->priv;
+
+ if (be->priv) {
+ mevent_delete(priv->mevp);
+ free(be->priv);
+ be->priv = NULL;
+ }
+ if (be->fd != -1) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static int
+tap_init(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param)
+{
+ char tbuf[80];
+ int fd;
+ int opt = 1;
+ struct tap_priv *priv;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ if (cb == NULL) {
+ WPRINTF(("TAP backend requires non-NULL callback\n"));
+ return -1;
+ }
+
+ priv = calloc(1, sizeof(struct tap_priv));
+ if (priv == NULL) {
+ WPRINTF(("tap_priv alloc failed\n"));
+ return -1;
+ }
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, devname, sizeof(tbuf));
+
+ fd = open(tbuf, O_RDWR);
+ if (fd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ goto error;
+ }
+
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ if (ioctl(fd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ goto error;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ priv->mevp = mevent_add(fd, EVF_READ, cb, param);
+ if (priv->mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ goto error;
+ }
+
+ be->fd = fd;
+ be->priv = priv;
+
+ return 0;
+
+error:
+ tap_cleanup(be);
+ return -1;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static int
+tap_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+ int more)
+{
+ static char pad[60]; /* all zero bytes */
+
+ (void)more;
+ /*
+ * If the length is < 60, pad out to that and add the
+ * extra zero'd segment to the iov. It is guaranteed that
+ * there is always an extra iov available by the caller.
+ */
+ if (len < 60) {
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = (size_t)(60 - len);
+ iovcnt++;
+ }
+
+ return (int)writev(be->fd, iov, iovcnt);
+}
+
+static int
+tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ int ret;
+
+ /* Should never be called without a valid tap fd */
+ assert(be->fd != -1);
+
+ ret = (int)readv(be->fd, iov, iovcnt);
+
+ if (ret < 0 && errno == EWOULDBLOCK) {
+ return 0;
+ }
+
+ return ret;
+}
+
+static uint64_t
+tap_get_cap(struct net_backend *be)
+{
+ (void)be;
+ return 0; /* nothing extra */
+}
+
+static int
+tap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ (void)be;
+ return (features || vnet_hdr_len) ? -1 : 0;
+}
+
+static struct net_backend tap_backend = {
+ .name = "tap|vmnet",
+ .init = tap_init,
+ .cleanup = tap_cleanup,
+ .send = tap_send,
+ .recv = tap_recv,
+ .get_cap = tap_get_cap,
+ .set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+#define NETMAP_POLLMASK (POLLIN | POLLRDNORM | POLLRDBAND)
+
+struct netmap_priv {
+ char ifname[IFNAMSIZ];
+ struct nm_desc *nmd;
+ uint16_t memid;
+ struct netmap_ring *rx;
+ struct netmap_ring *tx;
+ pthread_t evloop_tid;
+ net_backend_cb_t cb;
+ void *cb_param;
+};
+
+static void *
+netmap_evloop_thread(void *param)
+{
+ struct net_backend *be = param;
+ struct netmap_priv *priv = be->priv;
+ struct pollfd pfd;
+ int ret;
+
+ for (;;) {
+ pfd.fd = be->fd;
+ pfd.events = NETMAP_POLLMASK;
+ ret = poll(&pfd, 1, INFTIM);
+ if (ret == -1 && errno != EINTR) {
+ WPRINTF(("netmap poll failed, %d\n", errno));
+ } else if (ret == 1 && (pfd.revents & NETMAP_POLLMASK)) {
+ priv->cb(pfd.fd, EVF_READ, priv->cb_param);
+ }
+ }
+
+ return NULL;
+}
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+ memset(req, 0, sizeof(*req));
+ strncpy(req->nr_name, ifname, sizeof(req->nr_name));
+ req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+ int err;
+ struct nmreq req;
+ struct netmap_priv *priv = be->priv;
+
+ nmreq_init(&req, priv->ifname);
+ req.nr_cmd = NETMAP_BDG_VNET_HDR;
+ req.nr_arg1 = vnet_hdr_len;
+ err = ioctl(be->fd, NIOCREGIF, &req);
+ if (err) {
+ WPRINTF(("Unable to set vnet header length %d\n",
+ vnet_hdr_len));
+ return err;
+ }
+
+ be->be_vnet_hdr_len = vnet_hdr_len;
+
+ return 0;
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+ int prev_hdr_len = be->be_vnet_hdr_len;
+ int ret;
+
+ if (vnet_hdr_len == prev_hdr_len) {
+ return 1;
+ }
+
+ ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+ if (ret) {
+ return 0;
+ }
+
+ netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+ return 1;
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+ return netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
+ NETMAP_FEATURES : 0;
+}
+
+static int
+netmap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ return netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+}
+
+static int
+netmap_init(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param)
+{
+ struct netmap_priv *priv = NULL;
+
+ priv = calloc(1, sizeof(struct netmap_priv));
+ if (priv == NULL) {
+ WPRINTF(("Unable to alloc netmap private data\n"));
+ return -1;
+ }
+
+ strncpy(priv->ifname, devname, sizeof(priv->ifname));
+ priv->ifname[sizeof(priv->ifname) - 1] = '\0';
+
+ priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
+ if (priv->nmd == NULL) {
+ WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
+ devname, strerror(errno)));
+ free(priv);
+ return -1;
+ }
+
+ priv->memid = priv->nmd->req.nr_arg2;
+ priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
+ priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
+ priv->cb = cb;
+ priv->cb_param = param;
+ be->fd = priv->nmd->fd;
+ be->priv = priv;
+
+ /* TODO Turn this into a mevent_add */
+ {
+ char tname[40];
+
+ /* Create a thread for netmap poll. */
+ pthread_create(&priv->evloop_tid, NULL, netmap_evloop_thread, (void *)be);
+ snprintf(tname, sizeof(tname), "netmap-evloop-%p", priv);
+ pthread_set_name_np(priv->evloop_tid, tname);
+ }
+
+ return 0;
+}
+
+static void
+netmap_cleanup(struct net_backend *be)
+{
+ struct netmap_priv *priv = be->priv;
+
+ if (be->priv) {
+ nm_close(priv->nmd);
+ free(be->priv);
+ be->priv = NULL;
+ }
+ be->fd = -1;
+}
+
+/* A fast copy routine, only for multiples of 64 bytes and non-overlapping buffers. */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+ const uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (l >= 1024) {
+ bcopy(src, dst, l);
+ return;
+ }
+ for (; l > 0; l -= 64) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
+
+static int
+netmap_send(struct net_backend *be, struct iovec *iov,
+ int iovcnt, uint32_t size, int more)
+{
+ struct netmap_priv *priv = be->priv;
+ struct netmap_ring *ring;
+ int nm_buf_size;
+ int nm_buf_len;
+ uint32_t head;
+ void *nm_buf;
+ int j;
+
+ if (iovcnt <= 0 || size <= 0) {
+ D("Wrong iov: iovcnt %d size %d", iovcnt, size);
+ return 0;
+ }
+
+ ring = priv->tx;
+ head = ring->head;
+ if (head == ring->tail) {
+ RD(1, "No space, drop %d bytes", size);
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+
+ for (j = 0; j < iovcnt; j++) {
+ int iov_frag_size = iov[j].iov_len;
+ void *iov_frag_buf = iov[j].iov_base;
+
+ /*
+ * Split each iovec fragment across multiple netmap slots,
+ * if necessary.
+ */
+ for (;;) {
+ int copylen;
+
+ copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
+ pkt_copy(iov_frag_buf, nm_buf, copylen);
+
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ nm_buf += copylen;
+ nm_buf_size -= copylen;
+ nm_buf_len += copylen;
+
+ if (iov_frag_size == 0) {
+ break;
+ }
+
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = NS_MOREFRAG;
+ head = nm_ring_next(ring, head);
+ if (head == ring->tail) {
+ /* We ran out of netmap slots while
+ * splitting the iovec fragments. */
+ RD(1, "No space, drop %d bytes", size);
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+ }
+ }
+
+ /* Complete the last slot, which must not have NS_MOREFRAG set. */
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = 0;
+ head = nm_ring_next(ring, head);
+
+ /* Now update ring->head and ring->cur. */
+ ring->head = ring->cur = head;
+
+ if (more) {// && nm_ring_space(ring) > 64
+ return 0;
+ }
+txsync:
+ ioctl(be->fd, NIOCTXSYNC, NULL);
+
+ return 0;
+}
+
+static int
+netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ struct netmap_priv *priv = be->priv;
+ struct netmap_slot *slot = NULL;
+ struct netmap_ring *ring;
+ void *iov_frag_buf;
+ int iov_frag_size;
+ int totlen = 0;
+ uint32_t head;
+
+ assert(iovcnt);
+
+ ring = priv->rx;
+ head = ring->head;
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+
+ do {
+ int nm_buf_len;
+ void *nm_buf;
+
+ if (head == ring->tail) {
+ return 0;
+ }
+
+ slot = ring->slot + head;
+ nm_buf = NETMAP_BUF(ring, slot->buf_idx);
+ nm_buf_len = slot->len;
+
+ for (;;) {
+ int copylen = nm_buf_len < iov_frag_size ? nm_buf_len : iov_frag_size;
+
+ pkt_copy(nm_buf, iov_frag_buf, copylen);
+ nm_buf += copylen;
+ nm_buf_len -= copylen;
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ totlen += copylen;
+
+ if (nm_buf_len == 0) {
+ break;
+ }
+
+ iov++;
+ iovcnt--;
+ if (iovcnt == 0) {
+ /* No space to receive. */
+ D("Short iov, drop %d bytes", totlen);
+ return -ENOSPC;
+ }
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+ }
+
+ head = nm_ring_next(ring, head);
+
+ } while (slot->flags & NS_MOREFRAG);
+
+ /* Release slots to netmap. */
+ ring->head = ring->cur = head;
+
+ return totlen;
+}
+
+static struct net_backend netmap_backend = {
+ .name = "netmap|vale",
+ .init = netmap_init,
+ .cleanup = netmap_cleanup,
+ .send = netmap_send,
+ .recv = netmap_recv,
+ .get_cap = netmap_get_cap,
+ .set_cap = netmap_set_cap,
+};
+
+DATA_SET(net_backend_set, netmap_backend);
+
+/*
+ * make sure a backend is properly initialized
+ * TODO check and return error if not set!
+ */
+static int
+netbe_fix(struct net_backend *be)
+{
+ if (be == NULL)
+ return -1;
+ if (be->name == NULL) {
+ fprintf(stderr, "missing name for %p\n", be);
+ return -1;
+ }
+ if (be->init == NULL) {
+ fprintf(stderr, "missing init for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->cleanup == NULL) {
+ fprintf(stderr, "missing cleanup for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->send == NULL) {
+ fprintf(stderr, "missing send for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->recv == NULL) {
+ fprintf(stderr, "missing recv for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->get_cap == NULL) {
+ fprintf(stderr, "missing get_cap for %p %s\n",
+ be, be->name);
+ return -1;
+ }
+ if (be->set_cap == NULL) {
+ fprintf(stderr, "missing set_cap for %p %s\n",
+ be, be->name);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * 'keys' is a set of prefixes separated by '|'.
+ * Return a non-NULL pointer if the leftmost part of 'name' matches
+ * one of the prefixes, NULL otherwise.
+ */
+static const char *
+netbe_name_match(const char *keys, const char *name)
+{
+ const char *n = name, *good = keys;
+ char c;
+
+ if (!keys || !name)
+ return NULL;
+ while ( (c = *keys++) ) {
+ if (c == '|') { /* reached the separator */
+ if (good)
+ break;
+ /* prepare for new round */
+ n = name;
+ good = keys;
+ } else if (good && c != *n++) {
+ good = NULL; /* drop till next keyword */
+ }
+ }
+ return good;
+}
+
+/*
+ * Initialize a backend and attach to the frontend.
+ * This is called during frontend initialization.
+ * @devname is the backend-name as supplied on the command line,
+ * e.g. -s 2:0,frontend-name,backend-name[,other-args]
+ * @cb is the receive callback supplied by the frontend,
+ * and it is invoked in the event loop when a receive
+ * event is generated in the hypervisor.
+ * @param is a pointer to the frontend, and is normally used as
+ * the argument for the callback.
+ */
+struct net_backend *
+netbe_init(const char *devname, net_backend_cb_t cb, void *param)
+{
+ struct net_backend **pbe, *be, *tbe = NULL;
+ int err;
+
+ /*
+ * Find the network backend depending on the user-provided
+ * device name. net_backend_set is built using a linker set.
+ */
+ SET_FOREACH(pbe, net_backend_set) {
+ if (netbe_name_match((*pbe)->name, devname)) {
+ tbe = *pbe;
+ break;
+ }
+ }
+ if (tbe == NULL)
+ return NULL; /* or null backend ? */
+ be = calloc(1, sizeof(*be));
+ *be = *tbe; /* copy the template */
+ if (netbe_fix(be)) { /* make sure we have all fields */
+ free(be);
+ return NULL;
+ }
+ be->fd = -1;
+ be->priv = NULL;
+ be->sc = param;
+ be->be_vnet_hdr_len = 0;
+ be->fe_vnet_hdr_len = 0;
+
+ /* initialize the backend */
+ err = be->init(be, devname, cb, param);
+ if (err) {
+ free(be);
+ be = NULL;
+ }
+ return be;
+}
+
+void
+netbe_cleanup(struct net_backend *be)
+{
+ if (be == NULL)
+ return;
+ be->cleanup(be);
+ free(be);
+}
+
+uint64_t
+netbe_get_cap(struct net_backend *be)
+{
+ if (be == NULL)
+ return 0;
+ return be->get_cap(be);
+}
+
+int
+netbe_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ int ret;
+
+ if (be == NULL)
+ return 0;
+
+ /* There are only three valid lengths. */
+ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
+ && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
+ return -1;
+
+ be->fe_vnet_hdr_len = vnet_hdr_len;
+
+ ret = be->set_cap(be, features, vnet_hdr_len);
+ assert(be->be_vnet_hdr_len == 0 ||
+ be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
+
+ return ret;
+}
+
+static __inline struct iovec *
+iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+ struct iovec *riov;
+
+ /* XXX short-cut: assume first segment is >= tlen */
+ assert(iov[0].iov_len >= tlen);
+
+ iov[0].iov_len -= tlen;
+ if (iov[0].iov_len == 0) {
+ assert(*iovcnt > 1);
+ *iovcnt -= 1;
+ riov = &iov[1];
+ } else {
+ iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+ riov = &iov[0];
+ }
+
+ return (riov);
+}
+
+void
+netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+ int more)
+{
+ if (be == NULL)
+ return;
+#if 0
+ int i;
+ D("sending iovcnt %d len %d iovec %p", iovcnt, len, iov);
+ for (i=0; i < iovcnt; i++)
+ D(" %3d: %4d %p", i, (int)iov[i].iov_len, iov[i].iov_base);
+#endif
+ if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+ /* Here we are sure be->be_vnet_hdr_len is 0. */
+ iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
+ }
+
+ be->send(be, iov, iovcnt, len, more);
+}
+
+/*
+ * can return -1 in case of errors
+ */
+int
+netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ unsigned int hlen = 0; /* length of prepended virtio-net header */
+ int ret;
+
+ if (be == NULL)
+ return -1;
+
+ if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+ struct virtio_net_rxhdr *vh;
+
+ /* Here we are sure be->be_vnet_hdr_len is 0. */
+ hlen = be->fe_vnet_hdr_len;
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vh = iov[0].iov_base;
+ iov = iov_trim(iov, &iovcnt, hlen);
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers if merged rx bufs were negotiated.
+ */
+ memset(vh, 0, hlen);
+
+ if (hlen == VNET_HDR_LEN) {
+ vh->vrh_bufs = 1;
+ }
+ }
+
+ ret = be->recv(be, iov, iovcnt);
+ if (ret > 0) {
+ ret += hlen;
+ }
+
+ return ret;
+}
+
+/*
+ * Read a packet from the backend and discard it.
+ * Returns the size of the discarded packet or zero if no packet was available.
+ * A negative error code is returned in case of read error.
+ */
+int
+netbe_rx_discard(struct net_backend *be)
+{
+ /*
+ * MP note: the dummybuf is only used to discard frames,
+ * so there is no need for it to be per-vtnet or locked.
+ * We only make it large enough for a TSO-sized segment.
+ */
+ static uint8_t dummybuf[65536+64];
+ struct iovec iov;
+
+ iov.iov_base = dummybuf;
+ iov.iov_len = sizeof(dummybuf);
+
+ return netbe_recv(be, &iov, 1);
+}
+
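
The tap and netmap backends above plug into the frontend-visible API purely through the net_backend_set linker set. As an illustration of that registration mechanism (not part of this revision; the name and behaviour are hypothetical), a minimal "null" backend that drops all traffic could be added to net_backends.c like this; netbe_fix() insists that all seven fields are filled in.

    /* Hypothetical example backend, for illustration only. */
    static int
    null_init(struct net_backend *be, const char *devname,
        net_backend_cb_t cb, void *param)
    {
            (void)devname; (void)cb; (void)param;
            be->fd = -1;
            be->priv = NULL;
            return 0;
    }

    static void
    null_cleanup(struct net_backend *be)
    {
            be->fd = -1;
    }

    static int
    null_send(struct net_backend *be, struct iovec *iov, int iovcnt,
        uint32_t len, int more)
    {
            (void)be; (void)iov; (void)iovcnt; (void)more;
            return (int)len;        /* pretend the whole packet went out */
    }

    static int
    null_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
    {
            (void)be; (void)iov; (void)iovcnt;
            return 0;               /* never has a packet to deliver */
    }

    static uint64_t
    null_get_cap(struct net_backend *be)
    {
            (void)be;
            return 0;
    }

    static int
    null_set_cap(struct net_backend *be, uint64_t features,
        unsigned vnet_hdr_len)
    {
            (void)be;
            return (features || vnet_hdr_len) ? -1 : 0;
    }

    static struct net_backend null_backend = {
            .name = "null",
            .init = null_init,
            .cleanup = null_cleanup,
            .send = null_send,
            .recv = null_recv,
            .get_cap = null_get_cap,
            .set_cap = null_set_cap,
    };

    DATA_SET(net_backend_set, null_backend);

A device name starting with "null" would then select this backend through netbe_name_match() in netbe_init().
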
Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -32,22 +32,13 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#ifndef WITHOUT_CAPSICUM
-#include <sys/capsicum.h>
-#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
-#ifndef NETMAP_WITH_LIBS
-#define NETMAP_WITH_LIBS
-#endif
-#include <net/netmap_user.h>
+#include <net/if.h> /* IFNAMSIZ */
-#ifndef WITHOUT_CAPSICUM
-#include <capsicum_helpers.h>
-#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -58,44 +49,20 @@
#include <strings.h>
#include <unistd.h>
#include <assert.h>
-#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
-#include <sysexits.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
-#include "net_utils.h"
+#include "net_utils.h" /* MAC address generation */
+#include "net_backends.h" /* VirtIO capabilities */
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
-/*
- * Host capabilities. Note that we only offer a few of these.
- */
-#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
-#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
-#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
-#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
-#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
-#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
-#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
-#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
-#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
-#define VIRTIO_NET_F_GUEST_ANNOUNCE \
- (1 << 21) /* guest can send gratuitous pkts */
-
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
@@ -117,19 +84,6 @@
#define VTNET_MAXQ 3
-/*
- * Fixed network header size
- */
-struct virtio_net_rxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-} __packed;
-
/*
* Debug printf
*/
@@ -144,30 +98,24 @@
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
- struct mevent *vsc_mevp;
- int vsc_tapfd;
- struct nm_desc *vsc_nmd;
+ net_backend_t *vsc_be;
int vsc_rx_ready;
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
- struct virtio_net_config vsc_config;
-
pthread_mutex_t rx_mtx;
- int rx_vhdrlen;
+ unsigned int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
pthread_mutex_t tx_mtx;
pthread_cond_t tx_cond;
int tx_in_progress;
+ struct virtio_net_config vsc_config;
- void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
- void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
- int iovcnt, int len);
};
static void pci_vtnet_reset(void *);
@@ -227,80 +175,19 @@
* Called to send a buffer chain out to the tap device
*/
static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
+pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_tapfd == -1)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) writev(sc->vsc_tapfd, iov, iovcnt);
-}
-
-/*
- * Called when there is read activity on the tap file descriptor.
- * Each buffer posted by the guest is assumed to be able to contain
- * an entire ethernet frame + rx header.
- * MP note: the dummybuf is only used for discarding frames, so there
- * is no need for it to be per-vtnet or locked.
- */
-static uint8_t dummybuf[2048];
-
-static __inline struct iovec *
-rx_iov_trim(struct iovec *iov, int *niov, int tlen)
-{
- struct iovec *riov;
-
- /* XXX short-cut: assume first segment is >= tlen */
- assert(iov[0].iov_len >= tlen);
-
- iov[0].iov_len -= tlen;
- if (iov[0].iov_len == 0) {
- assert(*niov > 1);
- *niov -= 1;
- riov = &iov[1];
- } else {
- iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
- riov = &iov[0];
- }
-
- return (riov);
-}
-
-static void
-pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
+ struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
- void *vrx;
int len, n;
uint16_t idx;
- /*
- * Should never be called without a valid tap fd
- */
- assert(sc->vsc_tapfd != -1);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
if (!sc->vsc_rx_ready) {
/*
+ * The rx ring has not yet been set up.
* Drop the packet and try later.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ netbe_rx_discard(sc->vsc_be);
return;
}
@@ -310,10 +197,10 @@
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
+ * No available rx buffers. Drop the packet and try later.
+ * Interrupt on empty, if that's negotiated.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ netbe_rx_discard(sc->vsc_be);
vq_endchains(vq, 1);
return;
}
@@ -325,211 +212,11 @@
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = readv(sc->vsc_tapfd, riov, n);
-
- if (len < 0 && errno == EWOULDBLOCK) {
- /*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
- */
- vq_retchain(vq);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
- } while (vq_has_descs(vq));
-
- /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
+ len = netbe_recv(sc->vsc_be, iov, n);
-static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int r, i;
- int len = 0;
-
- for (r = nmd->cur_tx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_tx_ring)
- r = nmd->first_tx_ring;
- if (r == nmd->cur_tx_ring)
- break;
- continue;
+ if (len < 0) {
+ break;
}
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
-
- for (i = 0; i < iovcnt; i++) {
- if (len + iov[i].iov_len > 2048)
- break;
- memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
- len += iov[i].iov_len;
- }
- ring->slot[cur].len = len;
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_tx_ring = r;
- ioctl(nmd->fd, NIOCTXSYNC, NULL);
- break;
- }
-
- return (len);
-}
-
-static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int len = 0;
- int i = 0;
- int r;
-
- for (r = nmd->cur_rx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
- size_t left;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_rx_ring)
- r = nmd->first_rx_ring;
- if (r == nmd->cur_rx_ring)
- break;
- continue;
- }
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
- left = ring->slot[cur].len;
-
- for (i = 0; i < iovcnt && left > 0; i++) {
- if (iov[i].iov_len > left)
- iov[i].iov_len = left;
- memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
- len += iov[i].iov_len;
- left -= iov[i].iov_len;
- }
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_rx_ring = r;
- ioctl(nmd->fd, NIOCRXSYNC, NULL);
- break;
- }
- for (; i < iovcnt; i++)
- iov[i].iov_len = 0;
-
- return (len);
-}
-
-/*
- * Called to send a buffer chain out to the vale port
- */
-static void
-pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_nmd == NULL)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
-}
-
-static void
-pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
- struct vqueue_info *vq;
- void *vrx;
- int len, n;
- uint16_t idx;
-
- /*
- * Should never be called without a valid netmap descriptor
- */
- assert(sc->vsc_nmd != NULL);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
- if (!sc->vsc_rx_ready) {
- /*
- * Drop the packet and try later.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- return;
- }
-
- /*
- * Check for available rx buffers
- */
- vq = &sc->vsc_queues[VTNET_RXQ];
- if (!vq_has_descs(vq)) {
- /*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- vq_endchains(vq, 1);
- return;
- }
-
- do {
- /*
- * Get descriptor chain.
- */
- n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
- assert(n >= 1 && n <= VTNET_MAXSEGS);
-
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
if (len == 0) {
/*
@@ -541,40 +228,31 @@
return;
}
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+ /* Publish the info to the guest */
+ vq_relchain(vq, idx, (uint32_t)len);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, 1);
}
+/*
+ * Called when there is read activity on the backend file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ */
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
- sc->pci_vtnet_rx(sc);
+ pci_vtnet_rx(sc);
pthread_mutex_unlock(&sc->rx_mtx);
}
+/* Called on RX kick. */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
@@ -589,35 +267,33 @@
}
}
+/* TX virtqueue processing, called by the TX thread. */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
int i, n;
- int plen, tlen;
+ uint32_t len;
uint16_t idx;
/*
- * Obtain chain of descriptors. The first one is
- * really the header descriptor, so we need to sum
- * up two lengths: packet length and transfer length.
+ * Obtain chain of descriptors. The first descriptor also
+ * contains the virtio-net header.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- plen = 0;
- tlen = iov[0].iov_len;
- for (i = 1; i < n; i++) {
- plen += iov[i].iov_len;
- tlen += iov[i].iov_len;
+ len = 0;
+ for (i = 0; i < n; i++) {
+ len += iov[i].iov_len;
}
- DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
- sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
+ netbe_send(sc->vsc_be, iov, n, len, 0 /* more */);
- /* chain is processed, release it and set tlen */
- vq_relchain(vq, idx, tlen);
+ /* chain is processed, release it and set len */
+ vq_relchain(vq, idx, len);
}
+/* Called on TX kick. */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
@@ -647,6 +323,14 @@
struct vqueue_info *vq;
int error;
+ {
+ struct pci_devinst *pi = sc->vsc_vs.vs_pi;
+ char tname[MAXCOMLEN + 1];
+ snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
+ pi->pi_func);
+ pthread_set_name_np(pthread_self(), tname);
+ }
+
vq = &sc->vsc_queues[VTNET_TXQ];
/*
@@ -699,93 +383,27 @@
}
#endif
-static void
-pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, devname, sizeof(tbuf));
-
- sc->pci_vtnet_rx = pci_vtnet_tap_rx;
- sc->pci_vtnet_tx = pci_vtnet_tap_tx;
-
- sc->vsc_tapfd = open(tbuf, O_RDWR);
- if (sc->vsc_tapfd == -1) {
- WPRINTF(("open of tap device %s failed\n", tbuf));
- return;
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF(("tap device O_NONBLOCK failed\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
- sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-}
-
-static void
-pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
-{
- sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
- sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
-
- sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
- if (sc->vsc_nmd == NULL) {
- WPRINTF(("open of netmap device %s failed\n", ifname));
- return;
- }
-
- sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- nm_close(sc->vsc_nmd);
- sc->vsc_nmd = NULL;
- }
-}
-
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
- char tname[MAXCOMLEN + 1];
struct pci_vtnet_softc *sc;
- char *devname;
- char *vtopts;
+ char tname[MAXCOMLEN + 1];
+ struct virtio_consts *vc;
int mac_provided;
- sc = calloc(1, sizeof(struct pci_vtnet_softc));
+ /*
+ * Allocate data structures for further virtio initializations.
+ * sc also contains a copy of vtnet_vi_consts, because the
+ * capabilities advertised to the guest depend on the backend.
+ */
+ sc = calloc(1, sizeof(struct pci_vtnet_softc) +
+ sizeof(struct virtio_consts));
+ vc = (struct virtio_consts *)(sc + 1);
+ memcpy(vc, &vtnet_vi_consts, sizeof(*vc));
pthread_mutex_init(&sc->vsc_mtx, NULL);
- vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
- sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
-
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
@@ -796,13 +414,13 @@
#endif
/*
- * Attempt to open the tap device and read the MAC address
+ * Attempt to open the backend device and read the MAC address
* if specified
*/
mac_provided = 0;
- sc->vsc_tapfd = -1;
- sc->vsc_nmd = NULL;
if (opts != NULL) {
+ char *devname;
+ char *vtopts;
int err;
devname = vtopts = strdup(opts);
@@ -817,13 +435,12 @@
mac_provided = 1;
}
- if (strncmp(devname, "vale", 4) == 0)
- pci_vtnet_netmap_setup(sc, devname);
- if (strncmp(devname, "tap", 3) == 0 ||
- strncmp(devname, "vmnet", 5) == 0)
- pci_vtnet_tap_setup(sc, devname);
-
+ sc->vsc_be = netbe_init(devname, pci_vtnet_rx_callback, sc);
free(devname);
+ if (sc->vsc_be == NULL) {
+ return (EINVAL);
+ }
+ vc->vc_hv_caps |= netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {
@@ -837,10 +454,12 @@
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
- /* Link is up if we managed to open tap device or vale port. */
- sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
- sc->vsc_nmd != NULL);
+ /* Link is up if we managed to open the backend device. */
+ sc->vsc_config.status = (opts == NULL || sc->vsc_be);
+ vi_softc_linkup(&sc->vsc_vs, vc, sc, pi, sc->vsc_queues);
+ sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
@@ -876,8 +495,8 @@
struct pci_vtnet_softc *sc = vsc;
void *ptr;
- if (offset < 6) {
- assert(offset + size <= 6);
+ if (offset < (int)sizeof(sc->vsc_config.mac)) {
+ assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
/*
* The driver is allowed to change the MAC address
*/
@@ -909,14 +528,17 @@
sc->vsc_features = negotiated_features;
- if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+ if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
/* non-merge rx header is 2 bytes shorter */
sc->rx_vhdrlen -= 2;
}
+
+ /* Tell the backend to enable some capabilities it has advertised. */
+ netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen);
}
-struct pci_devemu pci_de_vnet = {
+static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,

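A short worked note on the vnet header lengths involved in the negotiation above (the numbers follow directly from struct virtio_net_rxhdr in net_backends.h; nothing here adds behaviour): sizeof(struct virtio_net_rxhdr) = 1 + 1 + 2 + 2 + 2 + 2 + 2 = 12 bytes, which is the header length used while VIRTIO_NET_F_MRG_RXBUF is negotiated; without merged rx buffers the 2-byte vrh_bufs field is unused and the header is 12 - 2 = 10 bytes; 0 means no virtio-net header at all. These are the only three values netbe_set_cap() accepts; anything else makes it return -1.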