diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -33,6 +33,7 @@ iov.c \ mem.c \ mevent.c \ + net_backend_netmap.c \ net_backends.c \ net_utils.c \ pci_emul.c \ @@ -92,8 +93,8 @@ CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" -CFLAGS+=-DNETGRAPH -LIBADD+= netgraph +SRCS+= net_backend_netgraph.c +LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL diff --git a/usr.sbin/bhyve/net_backend_netgraph.c b/usr.sbin/bhyve/net_backend_netgraph.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/net_backend_netgraph.c @@ -0,0 +1,191 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include "config.h" +#include "debug.h" +#include "net_backends.h" +#include "net_backends_priv.h" + +#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) + +static int +ng_init(struct net_backend *be, const char *devname __unused, + nvlist_t *nvl, net_be_rxeof_t cb, void *param) +{ + struct tap_priv *p = NET_BE_PRIV(be); + struct ngm_connect ngc; + const char *value, *nodename; + int sbsz; + int ctrl_sock; + int flags; + unsigned long maxsbsz; + size_t msbsz; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (cb == NULL) { + EPRINTLN("Netgraph backend requires non-NULL callback"); + return (-1); + } + + be->fd = -1; + + memset(&ngc, 0, sizeof(ngc)); + + value = get_config_value_node(nvl, "path"); + if (value == NULL) { + EPRINTLN("path must be provided"); + return (-1); + } + strncpy(ngc.path, value, NG_PATHSIZ - 1); + + value = get_config_value_node(nvl, "hook"); + if (value == NULL) + value = "vmlink"; + strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); + + value = get_config_value_node(nvl, "peerhook"); + if (value == NULL) { + EPRINTLN("peer hook must be provided"); + return (-1); + } + strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); + + nodename = get_config_value_node(nvl, "socket"); + if (NgMkSockNode(nodename, + &ctrl_sock, &be->fd) < 0) { + EPRINTLN("can't get Netgraph sockets"); + return (-1); + } + + if (NgSendMsg(ctrl_sock, ".", + NGM_GENERIC_COOKIE, + NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { + EPRINTLN("can't connect to node"); + close(ctrl_sock); + goto error; + } + + close(ctrl_sock); + + flags = fcntl(be->fd, F_GETFL); + + if (flags < 0) { + EPRINTLN("can't get socket flags"); + goto error; + } + + if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { + EPRINTLN("can't set O_NONBLOCK flag"); + goto error; + } + + /* + * The default ng_socket(4) buffer's size is too low. + * Calculate the minimum value between NG_SBUF_MAX_SIZE + * and kern.ipc.maxsockbuf. + */ + msbsz = sizeof(maxsbsz); + if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, + NULL, 0) < 0) { + EPRINTLN("can't get 'kern.ipc.maxsockbuf' value"); + goto error; + } + + /* + * We can't set the socket buffer size to kern.ipc.maxsockbuf value, + * as it takes into account the mbuf(9) overhead. + */ + maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); + + sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); + + if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, + sizeof(sbsz)) < 0) { + EPRINTLN("can't set TX buffer size"); + goto error; + } + + if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, + sizeof(sbsz)) < 0) { + EPRINTLN("can't set RX buffer size"); + goto error; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(be->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + memset(p->bbuf, 0, sizeof(p->bbuf)); + p->bbuflen = 0; + + p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (p->mevp == NULL) { + EPRINTLN("Could not register event"); + goto error; + } + + return (0); + +error: + tap_cleanup(be); + return (-1); +} + +static struct net_backend ng_backend = { + .prefix = "netgraph", + .priv_size = sizeof(struct tap_priv), + .init = ng_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +DATA_SET(net_backend_set, ng_backend); diff --git a/usr.sbin/bhyve/net_backend_netmap.c b/usr.sbin/bhyve/net_backend_netmap.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/net_backend_netmap.c @@ -0,0 +1,384 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#define NETMAP_WITH_LIBS +#include + +#include + +#include "debug.h" +#include "iov.h" +#include "mevent.h" +#include "net_backends.h" +#include "net_backends_priv.h" + +/* The virtio-net features supported by netmap. */ +#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ + VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) + +struct netmap_priv { + char ifname[IFNAMSIZ]; + struct nm_desc *nmd; + uint16_t memid; + struct netmap_ring *rx; + struct netmap_ring *tx; + struct mevent *mevp; + net_be_rxeof_t cb; + void *cb_param; +}; + +static void +nmreq_init(struct nmreq *req, char *ifname) +{ + + memset(req, 0, sizeof(*req)); + strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); + req->nr_version = NETMAP_API; +} + +static int +netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) +{ + int err; + struct nmreq req; + struct netmap_priv *priv = NET_BE_PRIV(be); + + nmreq_init(&req, priv->ifname); + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = vnet_hdr_len; + err = ioctl(be->fd, NIOCREGIF, &req); + if (err) { + EPRINTLN("Unable to set vnet header length %d", vnet_hdr_len); + return (err); + } + + be->be_vnet_hdr_len = vnet_hdr_len; + + return (0); +} + +static int +netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) +{ + unsigned prev_hdr_len = be->be_vnet_hdr_len; + int ret; + + if (vnet_hdr_len == prev_hdr_len) { + return (1); + } + + ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); + if (ret) { + return (0); + } + + netmap_set_vnet_hdr_len(be, prev_hdr_len); + + return (1); +} + +static uint64_t +netmap_get_cap(struct net_backend *be) +{ + + return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? + NETMAP_FEATURES : 0); +} + +static int +netmap_set_cap(struct net_backend *be, uint64_t features __unused, + unsigned vnet_hdr_len) +{ + + return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); +} + +static int +netmap_init(struct net_backend *be, const char *devname, + nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + strlcpy(priv->ifname, devname, sizeof(priv->ifname)); + priv->ifname[sizeof(priv->ifname) - 1] = '\0'; + + priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); + if (priv->nmd == NULL) { + EPRINTLN("Unable to nm_open(): interface '%s', errno (%s)", + devname, strerror(errno)); + return (-1); + } + + priv->memid = priv->nmd->req.nr_arg2; + priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); + priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); + priv->cb = cb; + priv->cb_param = param; + be->fd = priv->nmd->fd; + + priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + EPRINTLN("Could not register event"); + return (-1); + } + + return (0); +} + +static void +netmap_cleanup(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (priv->nmd) { + nm_close(priv->nmd); + } + be->fd = -1; +} + +static ssize_t +netmap_send(struct net_backend *be, const struct iovec *iov, + int iovcnt) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_ring *ring; + ssize_t totlen = 0; + int nm_buf_size; + int nm_buf_len; + uint32_t head; + uint8_t *nm_buf; + int j; + + ring = priv->tx; + head = ring->head; + if (head == ring->tail) { + EPRINTLN("No space, drop %zu bytes", count_iov(iov, iovcnt)); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + + for (j = 0; j < iovcnt; j++) { + uint8_t *iov_frag_buf = iov[j].iov_base; + int iov_frag_size = iov[j].iov_len; + + totlen += iov_frag_size; + + /* + * Split each iovec fragment over more netmap slots, if + * necessary. + */ + for (;;) { + int copylen; + + copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; + memcpy(nm_buf, iov_frag_buf, copylen); + + iov_frag_buf += copylen; + iov_frag_size -= copylen; + nm_buf += copylen; + nm_buf_size -= copylen; + nm_buf_len += copylen; + + if (iov_frag_size == 0) { + break; + } + + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = NS_MOREFRAG; + head = nm_ring_next(ring, head); + if (head == ring->tail) { + /* + * We ran out of netmap slots while + * splitting the iovec fragments. + */ + EPRINTLN("No space, drop %zu bytes", + count_iov(iov, iovcnt)); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + } + } + + /* Complete the last slot, which must not have NS_MOREFRAG set. */ + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = 0; + head = nm_ring_next(ring, head); + + /* Now update ring->head and ring->cur. */ + ring->head = ring->cur = head; +txsync: + ioctl(be->fd, NIOCTXSYNC, NULL); + + return (totlen); +} + +static ssize_t +netmap_peek_recvlen(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_ring *ring = priv->rx; + uint32_t head = ring->head; + ssize_t totlen = 0; + + while (head != ring->tail) { + struct netmap_slot *slot = ring->slot + head; + + totlen += slot->len; + if ((slot->flags & NS_MOREFRAG) == 0) + break; + head = nm_ring_next(ring, head); + } + + return (totlen); +} + +static ssize_t +netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_slot *slot = NULL; + struct netmap_ring *ring; + uint8_t *iov_frag_buf; + int iov_frag_size; + ssize_t totlen = 0; + uint32_t head; + + assert(iovcnt); + + ring = priv->rx; + head = ring->head; + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + + do { + uint8_t *nm_buf; + int nm_buf_len; + + if (head == ring->tail) { + return (0); + } + + slot = ring->slot + head; + nm_buf = NETMAP_BUF(ring, slot->buf_idx); + nm_buf_len = slot->len; + + for (;;) { + int copylen = nm_buf_len < iov_frag_size ? + nm_buf_len : iov_frag_size; + + memcpy(iov_frag_buf, nm_buf, copylen); + nm_buf += copylen; + nm_buf_len -= copylen; + iov_frag_buf += copylen; + iov_frag_size -= copylen; + totlen += copylen; + + if (nm_buf_len == 0) { + break; + } + + iov++; + iovcnt--; + if (iovcnt == 0) { + /* No space to receive. */ + EPRINTLN("Short iov, drop %zd bytes", + totlen); + return (-ENOSPC); + } + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + } + + head = nm_ring_next(ring, head); + + } while (slot->flags & NS_MOREFRAG); + + /* Release slots to netmap. */ + ring->head = ring->cur = head; + + return (totlen); +} + +static void +netmap_recv_enable(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + mevent_enable(priv->mevp); +} + +static void +netmap_recv_disable(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + mevent_disable(priv->mevp); +} + +static struct net_backend netmap_backend = { + .prefix = "netmap", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +/* A clone of the netmap backend, with a different prefix. */ +static struct net_backend vale_backend = { + .prefix = "vale", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +DATA_SET(net_backend_set, netmap_backend); +DATA_SET(net_backend_set, vale_backend); diff --git a/usr.sbin/bhyve/net_backends.h b/usr.sbin/bhyve/net_backends.h --- a/usr.sbin/bhyve/net_backends.h +++ b/usr.sbin/bhyve/net_backends.h @@ -28,7 +28,11 @@ #ifndef __NET_BACKENDS_H__ #define __NET_BACKENDS_H__ -#include +#include +#include +#include + +#include "mevent.h" /* Opaque type representing a network backend. */ typedef struct net_backend net_backend_t; @@ -91,4 +95,6 @@ uint16_t vrh_bufs; } __packed; +#define NET_BE_PRIV(be) ((void *)((be) + 1)) + #endif /* __NET_BACKENDS_H__ */ diff --git a/usr.sbin/bhyve/net_backends.c b/usr.sbin/bhyve/net_backends.c --- a/usr.sbin/bhyve/net_backends.c +++ b/usr.sbin/bhyve/net_backends.c @@ -32,8 +32,7 @@ * features) is exported by net_backends.h. */ -#include -#include /* u_short etc */ +#include #ifndef WITHOUT_CAPSICUM #include #endif @@ -43,153 +42,35 @@ #include #include -#include -#include -#define NETMAP_WITH_LIBS -#include +#include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include +#include +#include +#include #include #include #include #include -#include #include -#include -#include -#include -#include -#include - -#ifdef NETGRAPH -#include -#include -#include -#endif +#include #include "config.h" #include "debug.h" #include "iov.h" #include "mevent.h" #include "net_backends.h" +#include "net_backends_priv.h" #include "pci_emul.h" -#include - -/* - * Each network backend registers a set of function pointers that are - * used to implement the net backends API. - * This might need to be exposed if we implement backends in separate files. - */ -struct net_backend { - const char *prefix; /* prefix matching this backend */ - - /* - * Routines used to initialize and cleanup the resources needed - * by a backend. The cleanup function is used internally, - * and should not be called by the frontend. - */ - int (*init)(struct net_backend *be, const char *devname, - nvlist_t *nvl, net_be_rxeof_t cb, void *param); - void (*cleanup)(struct net_backend *be); - - /* - * Called to serve a guest transmit request. The scatter-gather - * vector provided by the caller has 'iovcnt' elements and contains - * the packet to send. - */ - ssize_t (*send)(struct net_backend *be, const struct iovec *iov, - int iovcnt); - - /* - * Get the length of the next packet that can be received from - * the backend. If no packets are currently available, this - * function returns 0. - */ - ssize_t (*peek_recvlen)(struct net_backend *be); - - /* - * Called to receive a packet from the backend. When the function - * returns a positive value 'len', the scatter-gather vector - * provided by the caller contains a packet with such length. - * The function returns 0 if the backend doesn't have a new packet to - * receive. - */ - ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, - int iovcnt); - - /* - * Ask the backend to enable or disable receive operation in the - * backend. On return from a disable operation, it is guaranteed - * that the receive callback won't be called until receive is - * enabled again. Note however that it is up to the caller to make - * sure that netbe_recv() is not currently being executed by another - * thread. - */ - void (*recv_enable)(struct net_backend *be); - void (*recv_disable)(struct net_backend *be); - - /* - * Ask the backend for the virtio-net features it is able to - * support. Possible features are TSO, UFO and checksum offloading - * in both rx and tx direction and for both IPv4 and IPv6. - */ - uint64_t (*get_cap)(struct net_backend *be); - - /* - * Tell the backend to enable/disable the specified virtio-net - * features (capabilities). - */ - int (*set_cap)(struct net_backend *be, uint64_t features, - unsigned int vnet_hdr_len); - - struct pci_vtnet_softc *sc; - int fd; - - /* - * Length of the virtio-net header used by the backend and the - * frontend, respectively. A zero value means that the header - * is not used. - */ - unsigned int be_vnet_hdr_len; - unsigned int fe_vnet_hdr_len; - - /* Size of backend-specific private data. */ - size_t priv_size; - - /* Backend-specific private data follows. */ -}; - -#define NET_BE_PRIV(be) ((void *)((be) + 1)) #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) -SET_DECLARE(net_backend_set, struct net_backend); - -#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) - -#define WPRINTF(params) PRINTLN params - -/* - * The tap backend - */ - -struct tap_priv { - struct mevent *mevp; - /* - * A bounce buffer that allows us to implement the peek_recvlen - * callback. In the future we may get the same information from - * the kevent data. - */ - char bbuf[1 << 16]; - ssize_t bbuflen; -}; - -static void +void tap_cleanup(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); @@ -216,7 +97,7 @@ #endif if (cb == NULL) { - WPRINTF(("TAP backend requires non-NULL callback")); + EPRINTLN("TAP backend requires non-NULL callback"); return (-1); } @@ -225,7 +106,7 @@ be->fd = open(tbuf, O_RDWR); if (be->fd == -1) { - WPRINTF(("open of tap device %s failed", tbuf)); + EPRINTLN("open of tap device %s failed", tbuf); goto error; } @@ -234,12 +115,12 @@ * notifications with the event loop */ if (ioctl(be->fd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed")); + EPRINTLN("tap device O_NONBLOCK failed"); goto error; } if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { - WPRINTF(("tap device link up failed")); + EPRINTLN("tap device link up failed"); goto error; } @@ -254,7 +135,7 @@ priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { - WPRINTF(("Could not register event")); + EPRINTLN("Could not register event"); goto error; } @@ -268,13 +149,13 @@ /* * Called to send a buffer chain out to the tap device */ -static ssize_t +ssize_t tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (writev(be->fd, iov, iovcnt)); } -static ssize_t +ssize_t tap_peek_recvlen(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); @@ -304,7 +185,7 @@ return (ret); } -static ssize_t +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct tap_priv *priv = NET_BE_PRIV(be); @@ -332,7 +213,7 @@ return (ret); } -static void +void tap_recv_enable(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); @@ -340,7 +221,7 @@ mevent_enable(priv->mevp); } -static void +void tap_recv_disable(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); @@ -348,14 +229,14 @@ mevent_disable(priv->mevp); } -static uint64_t +uint64_t tap_get_cap(struct net_backend *be __unused) { return (0); /* no capabilities for now */ } -static int +int tap_set_cap(struct net_backend *be __unused, uint64_t features, unsigned vnet_hdr_len) { @@ -395,508 +276,6 @@ DATA_SET(net_backend_set, tap_backend); DATA_SET(net_backend_set, vmnet_backend); -#ifdef NETGRAPH - -/* - * Netgraph backend - */ - -#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) - -static int -ng_init(struct net_backend *be, const char *devname __unused, - nvlist_t *nvl, net_be_rxeof_t cb, void *param) -{ - struct tap_priv *p = NET_BE_PRIV(be); - struct ngm_connect ngc; - const char *value, *nodename; - int sbsz; - int ctrl_sock; - int flags; - unsigned long maxsbsz; - size_t msbsz; -#ifndef WITHOUT_CAPSICUM - cap_rights_t rights; -#endif - - if (cb == NULL) { - WPRINTF(("Netgraph backend requires non-NULL callback")); - return (-1); - } - - be->fd = -1; - - memset(&ngc, 0, sizeof(ngc)); - - value = get_config_value_node(nvl, "path"); - if (value == NULL) { - WPRINTF(("path must be provided")); - return (-1); - } - strncpy(ngc.path, value, NG_PATHSIZ - 1); - - value = get_config_value_node(nvl, "hook"); - if (value == NULL) - value = "vmlink"; - strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); - - value = get_config_value_node(nvl, "peerhook"); - if (value == NULL) { - WPRINTF(("peer hook must be provided")); - return (-1); - } - strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); - - nodename = get_config_value_node(nvl, "socket"); - if (NgMkSockNode(nodename, - &ctrl_sock, &be->fd) < 0) { - WPRINTF(("can't get Netgraph sockets")); - return (-1); - } - - if (NgSendMsg(ctrl_sock, ".", - NGM_GENERIC_COOKIE, - NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { - WPRINTF(("can't connect to node")); - close(ctrl_sock); - goto error; - } - - close(ctrl_sock); - - flags = fcntl(be->fd, F_GETFL); - - if (flags < 0) { - WPRINTF(("can't get socket flags")); - goto error; - } - - if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { - WPRINTF(("can't set O_NONBLOCK flag")); - goto error; - } - - /* - * The default ng_socket(4) buffer's size is too low. - * Calculate the minimum value between NG_SBUF_MAX_SIZE - * and kern.ipc.maxsockbuf. - */ - msbsz = sizeof(maxsbsz); - if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, - NULL, 0) < 0) { - WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); - goto error; - } - - /* - * We can't set the socket buffer size to kern.ipc.maxsockbuf value, - * as it takes into account the mbuf(9) overhead. - */ - maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); - - sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); - - if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, - sizeof(sbsz)) < 0) { - WPRINTF(("can't set TX buffer size")); - goto error; - } - - if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, - sizeof(sbsz)) < 0) { - WPRINTF(("can't set RX buffer size")); - goto error; - } - -#ifndef WITHOUT_CAPSICUM - cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); - if (caph_rights_limit(be->fd, &rights) == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - memset(p->bbuf, 0, sizeof(p->bbuf)); - p->bbuflen = 0; - - p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); - if (p->mevp == NULL) { - WPRINTF(("Could not register event")); - goto error; - } - - return (0); - -error: - tap_cleanup(be); - return (-1); -} - -static struct net_backend ng_backend = { - .prefix = "netgraph", - .priv_size = sizeof(struct tap_priv), - .init = ng_init, - .cleanup = tap_cleanup, - .send = tap_send, - .peek_recvlen = tap_peek_recvlen, - .recv = tap_recv, - .recv_enable = tap_recv_enable, - .recv_disable = tap_recv_disable, - .get_cap = tap_get_cap, - .set_cap = tap_set_cap, -}; - -DATA_SET(net_backend_set, ng_backend); - -#endif /* NETGRAPH */ - -/* - * The netmap backend - */ - -/* The virtio-net features supported by netmap. */ -#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ - VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ - VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ - VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) - -struct netmap_priv { - char ifname[IFNAMSIZ]; - struct nm_desc *nmd; - uint16_t memid; - struct netmap_ring *rx; - struct netmap_ring *tx; - struct mevent *mevp; - net_be_rxeof_t cb; - void *cb_param; -}; - -static void -nmreq_init(struct nmreq *req, char *ifname) -{ - - memset(req, 0, sizeof(*req)); - strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); - req->nr_version = NETMAP_API; -} - -static int -netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) -{ - int err; - struct nmreq req; - struct netmap_priv *priv = NET_BE_PRIV(be); - - nmreq_init(&req, priv->ifname); - req.nr_cmd = NETMAP_BDG_VNET_HDR; - req.nr_arg1 = vnet_hdr_len; - err = ioctl(be->fd, NIOCREGIF, &req); - if (err) { - WPRINTF(("Unable to set vnet header length %d", - vnet_hdr_len)); - return (err); - } - - be->be_vnet_hdr_len = vnet_hdr_len; - - return (0); -} - -static int -netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) -{ - unsigned prev_hdr_len = be->be_vnet_hdr_len; - int ret; - - if (vnet_hdr_len == prev_hdr_len) { - return (1); - } - - ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); - if (ret) { - return (0); - } - - netmap_set_vnet_hdr_len(be, prev_hdr_len); - - return (1); -} - -static uint64_t -netmap_get_cap(struct net_backend *be) -{ - - return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? - NETMAP_FEATURES : 0); -} - -static int -netmap_set_cap(struct net_backend *be, uint64_t features __unused, - unsigned vnet_hdr_len) -{ - - return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); -} - -static int -netmap_init(struct net_backend *be, const char *devname, - nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - strlcpy(priv->ifname, devname, sizeof(priv->ifname)); - priv->ifname[sizeof(priv->ifname) - 1] = '\0'; - - priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); - if (priv->nmd == NULL) { - WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)", - devname, strerror(errno))); - return (-1); - } - - priv->memid = priv->nmd->req.nr_arg2; - priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); - priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); - priv->cb = cb; - priv->cb_param = param; - be->fd = priv->nmd->fd; - - priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); - if (priv->mevp == NULL) { - WPRINTF(("Could not register event")); - return (-1); - } - - return (0); -} - -static void -netmap_cleanup(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - if (priv->mevp) { - mevent_delete(priv->mevp); - } - if (priv->nmd) { - nm_close(priv->nmd); - } - be->fd = -1; -} - -static ssize_t -netmap_send(struct net_backend *be, const struct iovec *iov, - int iovcnt) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_ring *ring; - ssize_t totlen = 0; - int nm_buf_size; - int nm_buf_len; - uint32_t head; - uint8_t *nm_buf; - int j; - - ring = priv->tx; - head = ring->head; - if (head == ring->tail) { - WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); - goto txsync; - } - nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); - nm_buf_size = ring->nr_buf_size; - nm_buf_len = 0; - - for (j = 0; j < iovcnt; j++) { - uint8_t *iov_frag_buf = iov[j].iov_base; - int iov_frag_size = iov[j].iov_len; - - totlen += iov_frag_size; - - /* - * Split each iovec fragment over more netmap slots, if - * necessary. - */ - for (;;) { - int copylen; - - copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; - memcpy(nm_buf, iov_frag_buf, copylen); - - iov_frag_buf += copylen; - iov_frag_size -= copylen; - nm_buf += copylen; - nm_buf_size -= copylen; - nm_buf_len += copylen; - - if (iov_frag_size == 0) { - break; - } - - ring->slot[head].len = nm_buf_len; - ring->slot[head].flags = NS_MOREFRAG; - head = nm_ring_next(ring, head); - if (head == ring->tail) { - /* - * We ran out of netmap slots while - * splitting the iovec fragments. - */ - WPRINTF(("No space, drop %zu bytes", - count_iov(iov, iovcnt))); - goto txsync; - } - nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); - nm_buf_size = ring->nr_buf_size; - nm_buf_len = 0; - } - } - - /* Complete the last slot, which must not have NS_MOREFRAG set. */ - ring->slot[head].len = nm_buf_len; - ring->slot[head].flags = 0; - head = nm_ring_next(ring, head); - - /* Now update ring->head and ring->cur. */ - ring->head = ring->cur = head; -txsync: - ioctl(be->fd, NIOCTXSYNC, NULL); - - return (totlen); -} - -static ssize_t -netmap_peek_recvlen(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_ring *ring = priv->rx; - uint32_t head = ring->head; - ssize_t totlen = 0; - - while (head != ring->tail) { - struct netmap_slot *slot = ring->slot + head; - - totlen += slot->len; - if ((slot->flags & NS_MOREFRAG) == 0) - break; - head = nm_ring_next(ring, head); - } - - return (totlen); -} - -static ssize_t -netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_slot *slot = NULL; - struct netmap_ring *ring; - uint8_t *iov_frag_buf; - int iov_frag_size; - ssize_t totlen = 0; - uint32_t head; - - assert(iovcnt); - - ring = priv->rx; - head = ring->head; - iov_frag_buf = iov->iov_base; - iov_frag_size = iov->iov_len; - - do { - uint8_t *nm_buf; - int nm_buf_len; - - if (head == ring->tail) { - return (0); - } - - slot = ring->slot + head; - nm_buf = NETMAP_BUF(ring, slot->buf_idx); - nm_buf_len = slot->len; - - for (;;) { - int copylen = nm_buf_len < iov_frag_size ? - nm_buf_len : iov_frag_size; - - memcpy(iov_frag_buf, nm_buf, copylen); - nm_buf += copylen; - nm_buf_len -= copylen; - iov_frag_buf += copylen; - iov_frag_size -= copylen; - totlen += copylen; - - if (nm_buf_len == 0) { - break; - } - - iov++; - iovcnt--; - if (iovcnt == 0) { - /* No space to receive. */ - WPRINTF(("Short iov, drop %zd bytes", - totlen)); - return (-ENOSPC); - } - iov_frag_buf = iov->iov_base; - iov_frag_size = iov->iov_len; - } - - head = nm_ring_next(ring, head); - - } while (slot->flags & NS_MOREFRAG); - - /* Release slots to netmap. */ - ring->head = ring->cur = head; - - return (totlen); -} - -static void -netmap_recv_enable(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - mevent_enable(priv->mevp); -} - -static void -netmap_recv_disable(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - mevent_disable(priv->mevp); -} - -static struct net_backend netmap_backend = { - .prefix = "netmap", - .priv_size = sizeof(struct netmap_priv), - .init = netmap_init, - .cleanup = netmap_cleanup, - .send = netmap_send, - .peek_recvlen = netmap_peek_recvlen, - .recv = netmap_recv, - .recv_enable = netmap_recv_enable, - .recv_disable = netmap_recv_disable, - .get_cap = netmap_get_cap, - .set_cap = netmap_set_cap, -}; - -/* A clone of the netmap backend, with a different prefix. */ -static struct net_backend vale_backend = { - .prefix = "vale", - .priv_size = sizeof(struct netmap_priv), - .init = netmap_init, - .cleanup = netmap_cleanup, - .send = netmap_send, - .peek_recvlen = netmap_peek_recvlen, - .recv = netmap_recv, - .recv_enable = netmap_recv_enable, - .recv_disable = netmap_recv_disable, - .get_cap = netmap_get_cap, - .set_cap = netmap_set_cap, -}; - -DATA_SET(net_backend_set, netmap_backend); -DATA_SET(net_backend_set, vale_backend); - int netbe_legacy_config(nvlist_t *nvl, const char *opts) { diff --git a/usr.sbin/bhyve/net_backends_priv.h b/usr.sbin/bhyve/net_backends_priv.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/net_backends_priv.h @@ -0,0 +1,150 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NET_BACKENDS_PRIV_H__ +#define __NET_BACKENDS_PRIV_H__ + +#include + +/* + * Each network backend registers a set of function pointers that are + * used to implement the net backends API. Frontends should not invoke + * these functions directly, but should instead use the interface provided by + * net_backends.h. + */ +struct net_backend { + const char *prefix; /* prefix matching this backend */ + + /* + * Routines used to initialize and cleanup the resources needed + * by a backend. The cleanup function is used internally, + * and should not be called by the frontend. + */ + int (*init)(struct net_backend *be, const char *devname, + nvlist_t *nvl, net_be_rxeof_t cb, void *param); + void (*cleanup)(struct net_backend *be); + + /* + * Called to serve a guest transmit request. The scatter-gather + * vector provided by the caller has 'iovcnt' elements and contains + * the packet to send. + */ + ssize_t (*send)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Get the length of the next packet that can be received from + * the backend. If no packets are currently available, this + * function returns 0. + */ + ssize_t (*peek_recvlen)(struct net_backend *be); + + /* + * Called to receive a packet from the backend. When the function + * returns a positive value 'len', the scatter-gather vector + * provided by the caller contains a packet with such length. + * The function returns 0 if the backend doesn't have a new packet to + * receive. + */ + ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Ask the backend to enable or disable receive operation in the + * backend. On return from a disable operation, it is guaranteed + * that the receive callback won't be called until receive is + * enabled again. Note however that it is up to the caller to make + * sure that netbe_recv() is not currently being executed by another + * thread. + */ + void (*recv_enable)(struct net_backend *be); + void (*recv_disable)(struct net_backend *be); + + /* + * Ask the backend for the virtio-net features it is able to + * support. Possible features are TSO, UFO and checksum offloading + * in both rx and tx direction and for both IPv4 and IPv6. + */ + uint64_t (*get_cap)(struct net_backend *be); + + /* + * Tell the backend to enable/disable the specified virtio-net + * features (capabilities). + */ + int (*set_cap)(struct net_backend *be, uint64_t features, + unsigned int vnet_hdr_len); + + struct pci_vtnet_softc *sc; + int fd; + + /* + * Length of the virtio-net header used by the backend and the + * frontend, respectively. A zero value means that the header + * is not used. + */ + unsigned int be_vnet_hdr_len; + unsigned int fe_vnet_hdr_len; + + /* Size of backend-specific private data. */ + size_t priv_size; + + /* Backend-specific private data follows. */ +}; + +SET_DECLARE(net_backend_set, struct net_backend); + +#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) + +/* + * Export the tap backend routines for the benefit of other backends which have + * a similar interface to the kernel, i.e., they send and receive data using + * standard I/O system calls with a single file descriptor. + */ + +struct tap_priv { + struct mevent *mevp; + /* + * A bounce buffer that allows us to implement the peek_recvlen + * callback. In the future we may get the same information from + * the kevent data. + */ + char bbuf[1 << 16]; + ssize_t bbuflen; +}; + +void tap_cleanup(struct net_backend *be); +ssize_t tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt); +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt); +ssize_t tap_peek_recvlen(struct net_backend *be); +void tap_recv_enable(struct net_backend *be); +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt); +void tap_recv_disable(struct net_backend *be); +uint64_t tap_get_cap(struct net_backend *be); +int tap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len); + +#endif /* !__NET_BACKENDS_PRIV_H__ */