Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F153511859
D20659.id58685.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
44 KB
Referenced Files
None
Subscribers
None
D20659.id58685.diff
View Options
Index: usr.sbin/bhyve/Makefile
===================================================================
--- usr.sbin/bhyve/Makefile
+++ usr.sbin/bhyve/Makefile
@@ -32,6 +32,7 @@
mem.c \
mevent.c \
mptbl.c \
+ net_backends.c \
net_utils.c \
pci_ahci.c \
pci_e82545.c \
Index: usr.sbin/bhyve/net_backends.h
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.h
@@ -0,0 +1,110 @@
+/*-
+ * Copyright (c) 2014 Vincenzo Maffione <v.maffione@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include <stdint.h>
+#include <net/if.h>
+
+#include <net/netmap.h>
+#include <net/netmap_virt.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
+#include "mevent.h"
+
+extern int netmap_ioctl_counter;
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_backend_cb_t)(int, enum ev_type, void *param);
+net_backend_t *netbe_init(const char *devname,
+ net_backend_cb_t cb, void *param);
+void netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int netbe_set_cap(net_backend_t *be, uint64_t cap,
+ unsigned vnet_hdr_len);
+void netbe_send(net_backend_t *be, struct iovec *iov,
+ int iovcnt, uint32_t len, int more);
+int netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+int netbe_rx_discard(net_backend_t *be);
+
+
+/*
+ * Network device capabilities taken from VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontends
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
+ */
+#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+ (1 << 21) /* guest can send gratuitous pkts */
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/* Used to get read-only info. */
+struct netmap_if_info {
+ uint32_t nifp_offset;
+ uint16_t num_tx_rings;
+ uint16_t num_rx_rings;
+ uint16_t num_tx_slots;
+ uint16_t num_rx_slots;
+};
+
+#include "pci_emul.h"
+int net_parsemac(char *mac_str, uint8_t *mac_addr);
+void net_genmac(struct pci_devinst *pi, uint8_t *macaddr);
+
+#endif /* __NET_BACKENDS_H__ */
Index: usr.sbin/bhyve/net_backends.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.c
@@ -0,0 +1,897 @@
+/*-
+ * Copyright (c) 2014-2016 Vincenzo Maffione
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements multiple network backends (null, tap, netmap, ...),
+ * to be used by network frontends such as virtio-net and ptnet.
+ * The API to access the backend (e.g. send/receive packets, negotiate
+ * features) is exported by net_backends.h.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h> /* u_short etc */
+#include <net/if.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sysexits.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <poll.h>
+#include <assert.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#include <sys/capsicum.h>
+#endif
+
+#include "mevent.h"
+#include "net_backends.h"
+
+#include <sys/linker_set.h>
+
+/*
+ * Each network backend registers a set of function pointers that are
+ * used to implement the net backends API.
+ * This might need to be exposed if we implement backends in separate files.
+ */
+struct net_backend {
+ const char *name; /* name of the backend */
+ /*
+ * The init and cleanup functions are used internally,
+ * virtio-net should never use them.
+ */
+ int (*init)(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param);
+ void (*cleanup)(struct net_backend *be);
+
+
+ /*
+ * Called to serve a guest transmit request. The scatter-gather
+ * vector provided by the caller has 'iovcnt' elements and contains
+ * the packet to send. 'len' is the length of whole packet in bytes.
+ */
+ int (*send)(struct net_backend *be, struct iovec *iov,
+ int iovcnt, uint32_t len, int more);
+
+ /*
+ * Called to serve guest receive request. When the function
+ * returns a positive value, the scatter-gather vector
+ * provided by the caller (having 'iovcnt' elements in it) will
+ * contain a chunk of the received packet. The 'more' flag will
+ * be set if the returned chunk was the last one for the current
+ * packet, and 0 otherwise. The function returns the chunk size
+ * in bytes, or 0 if the backend doesn't have a new packet to
+ * receive.
+ * Note that it may be necessary to call this callback many
+ * times to receive a single packet, depending on how big the
+ * buffers you provide are.
+ */
+ int (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+ /*
+ * Ask the backend for the virtio-net features it is able to
+ * support. Possible features are TSO, UFO and checksum offloading
+ * in both rx and tx direction and for both IPv4 and IPv6.
+ */
+ uint64_t (*get_cap)(struct net_backend *be);
+
+ /*
+ * Tell the backend to enable/disable the specified virtio-net
+ * features (capabilities).
+ */
+ int (*set_cap)(struct net_backend *be, uint64_t features,
+ unsigned int vnet_hdr_len);
+
+ struct pci_vtnet_softc *sc;
+ int fd;
+ unsigned int be_vnet_hdr_len;
+ unsigned int fe_vnet_hdr_len;
+ /* TODO: implement priv space as a 'char opaque[0]' */
+ void *priv; /* Pointer to backend-specific data. */
+};
+
+SET_DECLARE(net_backend_set, struct net_backend);
+
+#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
+
+#define WPRINTF(params) printf params
+
+/* the tap backend */
+
+struct tap_priv {
+ struct mevent *mevp;
+};
+
+static void
+tap_cleanup(struct net_backend *be)
+{
+ struct tap_priv *priv = be->priv;
+
+ if (be->priv) {
+ mevent_delete(priv->mevp);
+ free(be->priv);
+ be->priv = NULL;
+ }
+ if (be->fd != -1) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static int
+tap_init(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param)
+{
+ char tbuf[80];
+ int fd;
+ int opt = 1;
+ struct tap_priv *priv;
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_t rights;
+#endif
+
+ if (cb == NULL) {
+ WPRINTF(("TAP backend requires non-NULL callback\n"));
+ return -1;
+ }
+
+ priv = calloc(1, sizeof(struct tap_priv));
+ if (priv == NULL) {
+ WPRINTF(("tap_priv alloc failed\n"));
+ return -1;
+ }
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, devname, sizeof(tbuf));
+
+ fd = open(tbuf, O_RDWR);
+ if (fd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ goto error;
+ }
+
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ if (ioctl(fd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ goto error;
+ }
+
+#ifndef WITHOUT_CAPSICUM
+ cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+ if (caph_rights_limit(fd, &rights) == -1)
+ errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+ priv->mevp = mevent_add(fd, EVF_READ, cb, param);
+ if (priv->mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ goto error;
+ }
+
+ be->fd = fd;
+ be->priv = priv;
+
+ return 0;
+
+error:
+ tap_cleanup(be);
+ return -1;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static int
+tap_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+ int more)
+{
+ static char pad[60]; /* all zero bytes */
+
+ (void)more;
+ /*
+ * If the length is < 60, pad out to that and add the
+ * extra zero'd segment to the iov. It is guaranteed that
+ * there is always an extra iov available by the caller.
+ */
+ if (len < 60) {
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = (size_t)(60 - len);
+ iovcnt++;
+ }
+
+ return (int)writev(be->fd, iov, iovcnt);
+}
+
+static int
+tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ int ret;
+
+ /* Should never be called without a valid tap fd */
+ assert(be->fd != -1);
+
+ ret = (int)readv(be->fd, iov, iovcnt);
+
+ if (ret < 0 && errno == EWOULDBLOCK) {
+ return 0;
+ }
+
+ return ret;
+}
+
+static uint64_t
+tap_get_cap(struct net_backend *be)
+{
+ (void)be;
+ return 0; // nothing extra
+}
+
+static int
+tap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ (void)be;
+ return (features || vnet_hdr_len) ? -1 : 0;
+}
+
+static struct net_backend tap_backend = {
+ .name = "tap|vmnet",
+ .init = tap_init,
+ .cleanup = tap_cleanup,
+ .send = tap_send,
+ .recv = tap_recv,
+ .get_cap = tap_get_cap,
+ .set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+#define NETMAP_POLLMASK (POLLIN | POLLRDNORM | POLLRDBAND)
+
+struct netmap_priv {
+ char ifname[IFNAMSIZ];
+ struct nm_desc *nmd;
+ uint16_t memid;
+ struct netmap_ring *rx;
+ struct netmap_ring *tx;
+ pthread_t evloop_tid;
+ net_backend_cb_t cb;
+ void *cb_param;
+};
+
+static void *
+netmap_evloop_thread(void *param)
+{
+ struct net_backend *be = param;
+ struct netmap_priv *priv = be->priv;
+ struct pollfd pfd;
+ int ret;
+
+ for (;;) {
+ pfd.fd = be->fd;
+ pfd.events = NETMAP_POLLMASK;
+ ret = poll(&pfd, 1, INFTIM);
+ if (ret == -1 && errno != EINTR) {
+ WPRINTF(("netmap poll failed, %d\n", errno));
+ } else if (ret == 1 && (pfd.revents & NETMAP_POLLMASK)) {
+ priv->cb(pfd.fd, EVF_READ, priv->cb_param);
+ }
+ }
+
+ return NULL;
+}
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+ memset(req, 0, sizeof(*req));
+ strncpy(req->nr_name, ifname, sizeof(req->nr_name));
+ req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+ int err;
+ struct nmreq req;
+ struct netmap_priv *priv = be->priv;
+
+ nmreq_init(&req, priv->ifname);
+ req.nr_cmd = NETMAP_BDG_VNET_HDR;
+ req.nr_arg1 = vnet_hdr_len;
+ err = ioctl(be->fd, NIOCREGIF, &req);
+ if (err) {
+ WPRINTF(("Unable to set vnet header length %d\n",
+ vnet_hdr_len));
+ return err;
+ }
+
+ be->be_vnet_hdr_len = vnet_hdr_len;
+
+ return 0;
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+ int prev_hdr_len = be->be_vnet_hdr_len;
+ int ret;
+
+ if (vnet_hdr_len == prev_hdr_len) {
+ return 1;
+ }
+
+ ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+ if (ret) {
+ return 0;
+ }
+
+ netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+ return 1;
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+ return netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
+ NETMAP_FEATURES : 0;
+}
+
+static int
+netmap_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ return netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+}
+
+static int
+netmap_init(struct net_backend *be, const char *devname,
+ net_backend_cb_t cb, void *param)
+{
+ struct netmap_priv *priv = NULL;
+
+ priv = calloc(1, sizeof(struct netmap_priv));
+ if (priv == NULL) {
+ WPRINTF(("Unable alloc netmap private data\n"));
+ return -1;
+ }
+
+ strncpy(priv->ifname, devname, sizeof(priv->ifname));
+ priv->ifname[sizeof(priv->ifname) - 1] = '\0';
+
+ priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
+ if (priv->nmd == NULL) {
+ WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
+ devname, strerror(errno)));
+ free(priv);
+ return -1;
+ }
+
+ priv->memid = priv->nmd->req.nr_arg2;
+ priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
+ priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
+ priv->cb = cb;
+ priv->cb_param = param;
+ be->fd = priv->nmd->fd;
+ be->priv = priv;
+
+ /* TODO Turn this into a mevent_add */
+ {
+ char tname[40];
+
+ /* Create a thread for netmap poll. */
+ pthread_create(&priv->evloop_tid, NULL, netmap_evloop_thread, (void *)be);
+ snprintf(tname, sizeof(tname), "netmap-evloop-%p", priv);
+ pthread_set_name_np(priv->evloop_tid, tname);
+ }
+
+ return 0;
+}
+
+static void
+netmap_cleanup(struct net_backend *be)
+{
+ struct netmap_priv *priv = be->priv;
+
+ if (be->priv) {
+ nm_close(priv->nmd);
+ free(be->priv);
+ be->priv = NULL;
+ }
+ be->fd = -1;
+}
+
+/* A fast copy routine only for multiples of 64 bytes, non overlapped. */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+ const uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (l >= 1024) {
+ bcopy(src, dst, l);
+ return;
+ }
+ for (; l > 0; l -= 64) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
+
+static int
+netmap_send(struct net_backend *be, struct iovec *iov,
+ int iovcnt, uint32_t size, int more)
+{
+ struct netmap_priv *priv = be->priv;
+ struct netmap_ring *ring;
+ int nm_buf_size;
+ int nm_buf_len;
+ uint32_t head;
+ void *nm_buf;
+ int j;
+
+ if (iovcnt <= 0 || size <= 0) {
+ D("Wrong iov: iovcnt %d size %d", iovcnt, size);
+ return 0;
+ }
+
+ ring = priv->tx;
+ head = ring->head;
+ if (head == ring->tail) {
+ RD(1, "No space, drop %d bytes", size);
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+
+ for (j = 0; j < iovcnt; j++) {
+ int iov_frag_size = iov[j].iov_len;
+ void *iov_frag_buf = iov[j].iov_base;
+
+ /* Split each iovec fragment over more netmap slots, if
+ necessary. */
+ for (;;) {
+ int copylen;
+
+ copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
+ pkt_copy(iov_frag_buf, nm_buf, copylen);
+
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ nm_buf += copylen;
+ nm_buf_size -= copylen;
+ nm_buf_len += copylen;
+
+ if (iov_frag_size == 0) {
+ break;
+ }
+
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = NS_MOREFRAG;
+ head = nm_ring_next(ring, head);
+ if (head == ring->tail) {
+ /* We ran out of netmap slots while
+ * splitting the iovec fragments. */
+ RD(1, "No space, drop %d bytes", size);
+ goto txsync;
+ }
+ nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+ nm_buf_size = ring->nr_buf_size;
+ nm_buf_len = 0;
+ }
+ }
+
+ /* Complete the last slot, which must not have NS_MOREFRAG set. */
+ ring->slot[head].len = nm_buf_len;
+ ring->slot[head].flags = 0;
+ head = nm_ring_next(ring, head);
+
+ /* Now update ring->head and ring->cur. */
+ ring->head = ring->cur = head;
+
+ if (more) {// && nm_ring_space(ring) > 64
+ return 0;
+ }
+txsync:
+ ioctl(be->fd, NIOCTXSYNC, NULL);
+
+ return 0;
+}
+
+static int
+netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ struct netmap_priv *priv = be->priv;
+ struct netmap_slot *slot = NULL;
+ struct netmap_ring *ring;
+ void *iov_frag_buf;
+ int iov_frag_size;
+ int totlen = 0;
+ uint32_t head;
+
+ assert(iovcnt);
+
+ ring = priv->rx;
+ head = ring->head;
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+
+ do {
+ int nm_buf_len;
+ void *nm_buf;
+
+ if (head == ring->tail) {
+ return 0;
+ }
+
+ slot = ring->slot + head;
+ nm_buf = NETMAP_BUF(ring, slot->buf_idx);
+ nm_buf_len = slot->len;
+
+ for (;;) {
+ int copylen = nm_buf_len < iov_frag_size ? nm_buf_len : iov_frag_size;
+
+ pkt_copy(nm_buf, iov_frag_buf, copylen);
+ nm_buf += copylen;
+ nm_buf_len -= copylen;
+ iov_frag_buf += copylen;
+ iov_frag_size -= copylen;
+ totlen += copylen;
+
+ if (nm_buf_len == 0) {
+ break;
+ }
+
+ iov++;
+ iovcnt--;
+ if (iovcnt == 0) {
+ /* No space to receive. */
+ D("Short iov, drop %d bytes", totlen);
+ return -ENOSPC;
+ }
+ iov_frag_buf = iov->iov_base;
+ iov_frag_size = iov->iov_len;
+ }
+
+ head = nm_ring_next(ring, head);
+
+ } while (slot->flags & NS_MOREFRAG);
+
+ /* Release slots to netmap. */
+ ring->head = ring->cur = head;
+
+ return totlen;
+}
+
+static struct net_backend netmap_backend = {
+ .name = "netmap|vale",
+ .init = netmap_init,
+ .cleanup = netmap_cleanup,
+ .send = netmap_send,
+ .recv = netmap_recv,
+ .get_cap = netmap_get_cap,
+ .set_cap = netmap_set_cap,
+};
+
+DATA_SET(net_backend_set, netmap_backend);
+
+/*
+ * make sure a backend is properly initialized
+ * TODO check and return error if not set!
+ */
+static int
+netbe_fix(struct net_backend *be)
+{
+ if (be == NULL)
+ return -1;
+ if (be->name == NULL) {
+ fprintf(stderr, "missing name for %p\n", be);
+ return -1;
+ }
+ if (be->init == NULL) {
+ fprintf(stderr, "missing init for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->cleanup == NULL) {
+ fprintf(stderr, "missing cleanup for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->send == NULL) {
+ fprintf(stderr, "missing send for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->recv == NULL) {
+ fprintf(stderr, "missing recv for %p %s\n", be, be->name);
+ return -1;
+ }
+ if (be->get_cap == NULL) {
+ fprintf(stderr, "missing get_cap for %p %s\n",
+ be, be->name);
+ return -1;
+ }
+ if (be->set_cap == NULL) {
+ fprintf(stderr, "missing set_cap for %p %s\n",
+ be, be->name);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * keys is a set of prefixes separated by '|',
+ * return 1 if the leftmost part of name matches one prefix.
+ */
+static const char *
+netbe_name_match(const char *keys, const char *name)
+{
+ const char *n = name, *good = keys;
+ char c;
+
+ if (!keys || !name)
+ return NULL;
+ while ( (c = *keys++) ) {
+ if (c == '|') { /* reached the separator */
+ if (good)
+ break;
+ /* prepare for new round */
+ n = name;
+ good = keys;
+ } else if (good && c != *n++) {
+ good = NULL; /* drop till next keyword */
+ }
+ }
+ return good;
+}
+
+/*
+ * Initialize a backend and attach to the frontend.
+ * This is called during frontend initialization.
+ * @devname is the backend-name as supplied on the command line,
+ * e.g. -s 2:0,frontend-name,backend-name[,other-args]
+ * @cb is the receive callback supplied by the frontend,
+ * and it is invoked in the event loop when a receive
+ * event is generated in the hypervisor,
+ * @param is a pointer to the frontend, and normally used as
+ * the argument for the callback.
+ */
+struct net_backend *
+netbe_init(const char *devname, net_backend_cb_t cb, void *param)
+{
+ struct net_backend **pbe, *be, *tbe = NULL;
+ int err;
+
+ /*
+ * Find the network backend depending on the user-provided
+ * device name. net_backend_set is built using a linker set.
+ */
+ SET_FOREACH(pbe, net_backend_set) {
+ if (netbe_name_match((*pbe)->name, devname)) {
+ tbe = *pbe;
+ break;
+ }
+ }
+ if (tbe == NULL)
+ return NULL; /* or null backend ? */
+ be = calloc(1, sizeof(*be));
+ *be = *tbe; /* copy the template */
+ if (netbe_fix(be)) { /* make sure we have all fields */
+ free(be);
+ return NULL;
+ }
+ be->fd = -1;
+ be->priv = NULL;
+ be->sc = param;
+ be->be_vnet_hdr_len = 0;
+ be->fe_vnet_hdr_len = 0;
+
+ /* initialize the backend */
+ err = be->init(be, devname, cb, param);
+ if (err) {
+ free(be);
+ be = NULL;
+ }
+ return be;
+}
+
+void
+netbe_cleanup(struct net_backend *be)
+{
+ if (be == NULL)
+ return;
+ be->cleanup(be);
+ free(be);
+}
+
+uint64_t
+netbe_get_cap(struct net_backend *be)
+{
+ if (be == NULL)
+ return 0;
+ return be->get_cap(be);
+}
+
+int
+netbe_set_cap(struct net_backend *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ int ret;
+
+ if (be == NULL)
+ return 0;
+
+ /* There are only three valid lengths. */
+ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
+ && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
+ return -1;
+
+ be->fe_vnet_hdr_len = vnet_hdr_len;
+
+ ret = be->set_cap(be, features, vnet_hdr_len);
+ assert(be->be_vnet_hdr_len == 0 ||
+ be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
+
+ return ret;
+}
+
+static __inline struct iovec *
+iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+ struct iovec *riov;
+
+ /* XXX short-cut: assume first segment is >= tlen */
+ assert(iov[0].iov_len >= tlen);
+
+ iov[0].iov_len -= tlen;
+ if (iov[0].iov_len == 0) {
+ assert(*iovcnt > 1);
+ *iovcnt -= 1;
+ riov = &iov[1];
+ } else {
+ iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+ riov = &iov[0];
+ }
+
+ return (riov);
+}
+
+void
+netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+ int more)
+{
+ if (be == NULL)
+ return;
+#if 0
+ int i;
+ D("sending iovcnt %d len %d iovec %p", iovcnt, len, iov);
+ for (i=0; i < iovcnt; i++)
+ D(" %3d: %4d %p", i, (int)iov[i].iov_len, iov[i].iov_base);
+#endif
+ if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+ /* Here we are sure be->be_vnet_hdr_len is 0. */
+ iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
+ }
+
+ be->send(be, iov, iovcnt, len, more);
+}
+
+/*
+ * can return -1 in case of errors
+ */
+int
+netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+ unsigned int hlen = 0; /* length of prepended virtio-net header */
+ int ret;
+
+ if (be == NULL)
+ return -1;
+
+ if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+ struct virtio_net_rxhdr *vh;
+
+ /* Here we are sure be->be_vnet_hdr_len is 0. */
+ hlen = be->fe_vnet_hdr_len;
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vh = iov[0].iov_base;
+ iov = iov_trim(iov, &iovcnt, hlen);
+
+ /*
+ * Here we are sure be->be_vnet_hdr_len is 0.
+ * The only valid field in the rx packet header is the
+ * number of buffers if merged rx bufs were negotiated.
+ */
+ memset(vh, 0, hlen);
+
+ if (hlen == VNET_HDR_LEN) {
+ vh->vrh_bufs = 1;
+ }
+ }
+
+ ret = be->recv(be, iov, iovcnt);
+ if (ret > 0) {
+ ret += hlen;
+ }
+
+ return ret;
+}
+
+/*
+ * Read a packet from the backend and discard it.
+ * Returns the size of the discarded packet or zero if no packet was available.
+ * A negative error code is returned in case of read error.
+ */
+int
+netbe_rx_discard(struct net_backend *be)
+{
+ /*
+ * MP note: the dummybuf is only used to discard frames,
+ * so there is no need for it to be per-vtnet or locked.
+ * We only make it large enough for TSO-sized segment.
+ */
+ static uint8_t dummybuf[65536+64];
+ struct iovec iov;
+
+ iov.iov_base = dummybuf;
+ iov.iov_len = sizeof(dummybuf);
+
+ return netbe_recv(be, &iov, 1);
+}
+
Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -32,22 +32,13 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#ifndef WITHOUT_CAPSICUM
-#include <sys/capsicum.h>
-#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
-#ifndef NETMAP_WITH_LIBS
-#define NETMAP_WITH_LIBS
-#endif
-#include <net/netmap_user.h>
+#include <net/if.h> /* IFNAMSIZ */
-#ifndef WITHOUT_CAPSICUM
-#include <capsicum_helpers.h>
-#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -58,44 +49,20 @@
#include <strings.h>
#include <unistd.h>
#include <assert.h>
-#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
-#include <sysexits.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
-#include "net_utils.h"
+#include "net_utils.h" /* MAC address generation */
+#include "net_backends.h" /* VirtIO capabilities */
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
-/*
- * Host capabilities. Note that we only offer a few of these.
- */
-#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
-#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
-#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
-#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
-#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
-#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
-#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
-#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
-#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
-#define VIRTIO_NET_F_GUEST_ANNOUNCE \
- (1 << 21) /* guest can send gratuitous pkts */
-
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
@@ -117,19 +84,6 @@
#define VTNET_MAXQ 3
-/*
- * Fixed network header size
- */
-struct virtio_net_rxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-} __packed;
-
/*
* Debug printf
*/
@@ -144,30 +98,24 @@
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
- struct mevent *vsc_mevp;
- int vsc_tapfd;
- struct nm_desc *vsc_nmd;
+ net_backend_t *vsc_be;
int vsc_rx_ready;
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
- struct virtio_net_config vsc_config;
-
pthread_mutex_t rx_mtx;
- int rx_vhdrlen;
+ unsigned int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
pthread_mutex_t tx_mtx;
pthread_cond_t tx_cond;
int tx_in_progress;
+ struct virtio_net_config vsc_config;
- void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
- void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
- int iovcnt, int len);
};
static void pci_vtnet_reset(void *);
@@ -227,80 +175,19 @@
* Called to send a buffer chain out to the tap device
*/
static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
+pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_tapfd == -1)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) writev(sc->vsc_tapfd, iov, iovcnt);
-}
-
-/*
- * Called when there is read activity on the tap file descriptor.
- * Each buffer posted by the guest is assumed to be able to contain
- * an entire ethernet frame + rx header.
- * MP note: the dummybuf is only used for discarding frames, so there
- * is no need for it to be per-vtnet or locked.
- */
-static uint8_t dummybuf[2048];
-
-static __inline struct iovec *
-rx_iov_trim(struct iovec *iov, int *niov, int tlen)
-{
- struct iovec *riov;
-
- /* XXX short-cut: assume first segment is >= tlen */
- assert(iov[0].iov_len >= tlen);
-
- iov[0].iov_len -= tlen;
- if (iov[0].iov_len == 0) {
- assert(*niov > 1);
- *niov -= 1;
- riov = &iov[1];
- } else {
- iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
- riov = &iov[0];
- }
-
- return (riov);
-}
-
-static void
-pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
+ struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
- void *vrx;
int len, n;
uint16_t idx;
- /*
- * Should never be called without a valid tap fd
- */
- assert(sc->vsc_tapfd != -1);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
if (!sc->vsc_rx_ready) {
/*
+ * The rx ring has not yet been set up.
* Drop the packet and try later.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ netbe_rx_discard(sc->vsc_be);
return;
}
@@ -310,10 +197,10 @@
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
+ * No available rx buffers. Drop the packet and try later.
+ * Interrupt on empty, if that's negotiated.
*/
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ netbe_rx_discard(sc->vsc_be);
vq_endchains(vq, 1);
return;
}
@@ -325,211 +212,11 @@
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = readv(sc->vsc_tapfd, riov, n);
-
- if (len < 0 && errno == EWOULDBLOCK) {
- /*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
- */
- vq_retchain(vq);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
- } while (vq_has_descs(vq));
-
- /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
+ len = netbe_recv(sc->vsc_be, iov, n);
-static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int r, i;
- int len = 0;
-
- for (r = nmd->cur_tx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_tx_ring)
- r = nmd->first_tx_ring;
- if (r == nmd->cur_tx_ring)
- break;
- continue;
+ if (len < 0) {
+ break;
}
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
-
- for (i = 0; i < iovcnt; i++) {
- if (len + iov[i].iov_len > 2048)
- break;
- memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
- len += iov[i].iov_len;
- }
- ring->slot[cur].len = len;
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_tx_ring = r;
- ioctl(nmd->fd, NIOCTXSYNC, NULL);
- break;
- }
-
- return (len);
-}
-
-static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int len = 0;
- int i = 0;
- int r;
-
- for (r = nmd->cur_rx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
- size_t left;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_rx_ring)
- r = nmd->first_rx_ring;
- if (r == nmd->cur_rx_ring)
- break;
- continue;
- }
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
- left = ring->slot[cur].len;
-
- for (i = 0; i < iovcnt && left > 0; i++) {
- if (iov[i].iov_len > left)
- iov[i].iov_len = left;
- memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
- len += iov[i].iov_len;
- left -= iov[i].iov_len;
- }
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_rx_ring = r;
- ioctl(nmd->fd, NIOCRXSYNC, NULL);
- break;
- }
- for (; i < iovcnt; i++)
- iov[i].iov_len = 0;
-
- return (len);
-}
-
-/*
- * Called to send a buffer chain out to the vale port
- */
-static void
-pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_nmd == NULL)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
-}
-
-static void
-pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
- struct vqueue_info *vq;
- void *vrx;
- int len, n;
- uint16_t idx;
-
- /*
- * Should never be called without a valid netmap descriptor
- */
- assert(sc->vsc_nmd != NULL);
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
- if (!sc->vsc_rx_ready) {
- /*
- * Drop the packet and try later.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- return;
- }
-
- /*
- * Check for available rx buffers
- */
- vq = &sc->vsc_queues[VTNET_RXQ];
- if (!vq_has_descs(vq)) {
- /*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- vq_endchains(vq, 1);
- return;
- }
-
- do {
- /*
- * Get descriptor chain.
- */
- n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
- assert(n >= 1 && n <= VTNET_MAXSEGS);
-
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
if (len == 0) {
/*
@@ -541,40 +228,31 @@
return;
}
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+ /* Publish the info to the guest */
+ vq_relchain(vq, idx, (uint32_t)len);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, 1);
}
+/*
+ * Called when there is read activity on the backend file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ */
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
- sc->pci_vtnet_rx(sc);
+ pci_vtnet_rx(sc);
pthread_mutex_unlock(&sc->rx_mtx);
}
+/* Called on RX kick. */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
@@ -589,35 +267,33 @@
}
}
+/* TX virtqueue processing, called by the TX thread. */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
int i, n;
- int plen, tlen;
+ uint32_t len;
uint16_t idx;
/*
- * Obtain chain of descriptors. The first one is
- * really the header descriptor, so we need to sum
- * up two lengths: packet length and transfer length.
+ * Obtain chain of descriptors. The first descriptor also
+ * contains the virtio-net header.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- plen = 0;
- tlen = iov[0].iov_len;
- for (i = 1; i < n; i++) {
- plen += iov[i].iov_len;
- tlen += iov[i].iov_len;
+ len = 0;
+ for (i = 0; i < n; i++) {
+ len += iov[i].iov_len;
}
- DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
- sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
+ netbe_send(sc->vsc_be, iov, n, len, 0 /* more */);
- /* chain is processed, release it and set tlen */
- vq_relchain(vq, idx, tlen);
+ /* chain is processed, release it and set len */
+ vq_relchain(vq, idx, len);
}
+/* Called on TX kick. */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
@@ -647,6 +323,14 @@
struct vqueue_info *vq;
int error;
+ {
+ struct pci_devinst *pi = sc->vsc_vs.vs_pi;
+ char tname[MAXCOMLEN + 1];
+ snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
+ pi->pi_func);
+ pthread_set_name_np(pthread_self(), tname);
+ }
+
vq = &sc->vsc_queues[VTNET_TXQ];
/*
@@ -699,93 +383,27 @@
}
#endif
-static void
-pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, devname, sizeof(tbuf));
-
- sc->pci_vtnet_rx = pci_vtnet_tap_rx;
- sc->pci_vtnet_tx = pci_vtnet_tap_tx;
-
- sc->vsc_tapfd = open(tbuf, O_RDWR);
- if (sc->vsc_tapfd == -1) {
- WPRINTF(("open of tap device %s failed\n", tbuf));
- return;
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF(("tap device O_NONBLOCK failed\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
- sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-}
-
-static void
-pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
-{
- sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
- sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
-
- sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
- if (sc->vsc_nmd == NULL) {
- WPRINTF(("open of netmap device %s failed\n", ifname));
- return;
- }
-
- sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- nm_close(sc->vsc_nmd);
- sc->vsc_nmd = NULL;
- }
-}
-
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
- char tname[MAXCOMLEN + 1];
struct pci_vtnet_softc *sc;
- char *devname;
- char *vtopts;
+ char tname[MAXCOMLEN + 1];
+ struct virtio_consts *vc;
int mac_provided;
- sc = calloc(1, sizeof(struct pci_vtnet_softc));
+ /*
+ * Allocate data structures for further virtio initializations.
+ * sc also contains a copy of the vtnet_vi_consts,
+ * because the capabilities change depending on
+ * the backend.
+ */
+ sc = calloc(1, sizeof(struct pci_vtnet_softc) +
+ sizeof(struct virtio_consts));
+ vc = (struct virtio_consts *)(sc + 1);
+ memcpy(vc, &vtnet_vi_consts, sizeof(*vc));
pthread_mutex_init(&sc->vsc_mtx, NULL);
- vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
- sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
-
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
@@ -796,13 +414,13 @@
#endif
/*
- * Attempt to open the tap device and read the MAC address
+ * Attempt to open the backend device and read the MAC address
* if specified
*/
mac_provided = 0;
- sc->vsc_tapfd = -1;
- sc->vsc_nmd = NULL;
if (opts != NULL) {
+ char *devname;
+ char *vtopts;
int err;
devname = vtopts = strdup(opts);
@@ -817,13 +435,12 @@
mac_provided = 1;
}
- if (strncmp(devname, "vale", 4) == 0)
- pci_vtnet_netmap_setup(sc, devname);
- if (strncmp(devname, "tap", 3) == 0 ||
- strncmp(devname, "vmnet", 5) == 0)
- pci_vtnet_tap_setup(sc, devname);
-
+ sc->vsc_be = netbe_init(devname, pci_vtnet_rx_callback, sc);
free(devname);
+ if (sc->vsc_be == NULL) {
+ return (EINVAL);
+ }
+ vc->vc_hv_caps |= netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {
@@ -837,10 +454,12 @@
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
- /* Link is up if we managed to open tap device or vale port. */
- sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
- sc->vsc_nmd != NULL);
+ /* Link is up if we managed to open backend device. */
+ sc->vsc_config.status = (opts == NULL || sc->vsc_be);
+ vi_softc_linkup(&sc->vsc_vs, vc, sc, pi, sc->vsc_queues);
+ sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
@@ -876,8 +495,8 @@
struct pci_vtnet_softc *sc = vsc;
void *ptr;
- if (offset < 6) {
- assert(offset + size <= 6);
+ if (offset < (int)sizeof(sc->vsc_config.mac)) {
+ assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
/*
* The driver is allowed to change the MAC address
*/
@@ -909,14 +528,17 @@
sc->vsc_features = negotiated_features;
- if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+ if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
/* non-merge rx header is 2 bytes shorter */
sc->rx_vhdrlen -= 2;
}
+
+ /* Tell the backend to enable some capabilities it has advertised. */
+ netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen);
}
-struct pci_devemu pci_de_vnet = {
+static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Apr 22, 1:33 PM (17 h, 5 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31981945
Default Alt Text
D20659.id58685.diff (44 KB)
Attached To
Mode
D20659: bhyve: abstraction for network backends
Attached
Detach File
Event Timeline
Log In to Comment