diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index de8e87d2ad49..6ce7f6c7ba62 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,119 +1,120 @@ # # .include PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} .PATH: ${.CURDIR}/${MACHINE_CPUARCH} \ ${SRCTOP}/sys/libkern \ ${SRCTOP}/sys/cam/ctl SRCS= \ acpi.c \ acpi_device.c \ audio.c \ basl.c \ bhyvegc.c \ bhyverun.c \ bhyverun_machdep.c \ block_if.c \ bootrom.c \ config.c \ console.c \ crc16.c \ ctl_scsi_all.c \ ctl_util.c \ hda_codec.c \ iov.c \ mem.c \ mevent.c \ + net_backend_netmap.c \ net_backends.c \ net_utils.c \ pci_emul.c \ pci_hostbridge.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_input.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ qemu_fwcfg.c \ qemu_loader.c \ smbiostbl.c \ sockstream.c \ tpm_device.c \ tpm_emul_passthru.c \ tpm_intf_crb.c \ tpm_ppi_qemu.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vmexit.c \ vmgenc.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c .endif .include "${MACHINE_CPUARCH}/Makefile.inc" .if defined(BHYVE_GDB_SUPPORT) SRCS+= gdb.c CFLAGS+= -DBHYVE_GDB .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif .endif CFLAGS+=-I${.CURDIR} \ -I${.CURDIR}/../../contrib/lib9p \ -I${SRCTOP}/sys LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" -CFLAGS+=-DNETGRAPH -LIBADD+= netgraph +SRCS+= net_backend_netgraph.c +LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto CFLAGS+=-DOPENSSL_API_COMPAT=0x10100000L .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= 
-I${SRCTOP}/contrib/libucl/include CFLAGS+= -DBHYVE_SNAPSHOT .endif # Disable thread safety analysis since it only finds very simple bugs and # yields many false positives. NO_WTHREAD_SAFETY= NO_WCAST_ALIGN= .include diff --git a/usr.sbin/bhyve/net_backend_netgraph.c b/usr.sbin/bhyve/net_backend_netgraph.c new file mode 100644 index 000000000000..7d1659d611e3 --- /dev/null +++ b/usr.sbin/bhyve/net_backend_netgraph.c @@ -0,0 +1,191 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include "config.h" +#include "debug.h" +#include "net_backends.h" +#include "net_backends_priv.h" + +#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) + +static int +ng_init(struct net_backend *be, const char *devname __unused, + nvlist_t *nvl, net_be_rxeof_t cb, void *param) +{ + struct tap_priv *p = NET_BE_PRIV(be); + struct ngm_connect ngc; + const char *value, *nodename; + int sbsz; + int ctrl_sock; + int flags; + unsigned long maxsbsz; + size_t msbsz; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (cb == NULL) { + EPRINTLN("Netgraph backend requires non-NULL callback"); + return (-1); + } + + be->fd = -1; + + memset(&ngc, 0, sizeof(ngc)); + + value = get_config_value_node(nvl, "path"); + if (value == NULL) { + EPRINTLN("path must be provided"); + return (-1); + } + strncpy(ngc.path, value, NG_PATHSIZ - 1); + + value = get_config_value_node(nvl, "hook"); + if (value == NULL) + value = "vmlink"; + strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); + + value = get_config_value_node(nvl, "peerhook"); + if (value == NULL) { + EPRINTLN("peer hook must be provided"); + return (-1); + } + strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); + + nodename = get_config_value_node(nvl, "socket"); + if (NgMkSockNode(nodename, + &ctrl_sock, &be->fd) < 0) { + EPRINTLN("can't get Netgraph sockets"); + return (-1); + } + + if (NgSendMsg(ctrl_sock, ".", + NGM_GENERIC_COOKIE, + NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { + EPRINTLN("can't connect to node"); + close(ctrl_sock); + goto error; + } + + close(ctrl_sock); + + flags = fcntl(be->fd, F_GETFL); + + if (flags < 0) { + EPRINTLN("can't get socket flags"); + goto error; + } + + if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { + EPRINTLN("can't set O_NONBLOCK flag"); + goto error; + } + + /* + * The default ng_socket(4) buffer's size is too low. 
+ * Calculate the minimum value between NG_SBUF_MAX_SIZE + * and kern.ipc.maxsockbuf. + */ + msbsz = sizeof(maxsbsz); + if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, + NULL, 0) < 0) { + EPRINTLN("can't get 'kern.ipc.maxsockbuf' value"); + goto error; + } + + /* + * We can't set the socket buffer size to kern.ipc.maxsockbuf value, + * as it takes into account the mbuf(9) overhead. + */ + maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); + + sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); + + if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, + sizeof(sbsz)) < 0) { + EPRINTLN("can't set TX buffer size"); + goto error; + } + + if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, + sizeof(sbsz)) < 0) { + EPRINTLN("can't set RX buffer size"); + goto error; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(be->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + memset(p->bbuf, 0, sizeof(p->bbuf)); + p->bbuflen = 0; + + p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (p->mevp == NULL) { + EPRINTLN("Could not register event"); + goto error; + } + + return (0); + +error: + tap_cleanup(be); + return (-1); +} + +static struct net_backend ng_backend = { + .prefix = "netgraph", + .priv_size = sizeof(struct tap_priv), + .init = ng_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +DATA_SET(net_backend_set, ng_backend); diff --git a/usr.sbin/bhyve/net_backend_netmap.c b/usr.sbin/bhyve/net_backend_netmap.c new file mode 100644 index 000000000000..5ba11b96797c --- /dev/null +++ b/usr.sbin/bhyve/net_backend_netmap.c @@ -0,0 +1,384 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary 
forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#define NETMAP_WITH_LIBS +#include + +#include + +#include "debug.h" +#include "iov.h" +#include "mevent.h" +#include "net_backends.h" +#include "net_backends_priv.h" + +/* The virtio-net features supported by netmap. 
*/ +#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ + VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) + +struct netmap_priv { + char ifname[IFNAMSIZ]; + struct nm_desc *nmd; + uint16_t memid; + struct netmap_ring *rx; + struct netmap_ring *tx; + struct mevent *mevp; + net_be_rxeof_t cb; + void *cb_param; +}; + +static void +nmreq_init(struct nmreq *req, char *ifname) +{ + + memset(req, 0, sizeof(*req)); + strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); + req->nr_version = NETMAP_API; +} + +static int +netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) +{ + int err; + struct nmreq req; + struct netmap_priv *priv = NET_BE_PRIV(be); + + nmreq_init(&req, priv->ifname); + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = vnet_hdr_len; + err = ioctl(be->fd, NIOCREGIF, &req); + if (err) { + EPRINTLN("Unable to set vnet header length %d", vnet_hdr_len); + return (err); + } + + be->be_vnet_hdr_len = vnet_hdr_len; + + return (0); +} + +static int +netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) +{ + unsigned prev_hdr_len = be->be_vnet_hdr_len; + int ret; + + if (vnet_hdr_len == prev_hdr_len) { + return (1); + } + + ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); + if (ret) { + return (0); + } + + netmap_set_vnet_hdr_len(be, prev_hdr_len); + + return (1); +} + +static uint64_t +netmap_get_cap(struct net_backend *be) +{ + + return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? 
+ NETMAP_FEATURES : 0); +} + +static int +netmap_set_cap(struct net_backend *be, uint64_t features __unused, + unsigned vnet_hdr_len) +{ + + return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); +} + +static int +netmap_init(struct net_backend *be, const char *devname, + nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + strlcpy(priv->ifname, devname, sizeof(priv->ifname)); + priv->ifname[sizeof(priv->ifname) - 1] = '\0'; + + priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); + if (priv->nmd == NULL) { + EPRINTLN("Unable to nm_open(): interface '%s', errno (%s)", + devname, strerror(errno)); + return (-1); + } + + priv->memid = priv->nmd->req.nr_arg2; + priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); + priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); + priv->cb = cb; + priv->cb_param = param; + be->fd = priv->nmd->fd; + + priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + EPRINTLN("Could not register event"); + return (-1); + } + + return (0); +} + +static void +netmap_cleanup(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (priv->nmd) { + nm_close(priv->nmd); + } + be->fd = -1; +} + +static ssize_t +netmap_send(struct net_backend *be, const struct iovec *iov, + int iovcnt) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_ring *ring; + ssize_t totlen = 0; + int nm_buf_size; + int nm_buf_len; + uint32_t head; + uint8_t *nm_buf; + int j; + + ring = priv->tx; + head = ring->head; + if (head == ring->tail) { + EPRINTLN("No space, drop %zu bytes", count_iov(iov, iovcnt)); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + + for (j = 0; j < iovcnt; j++) { + uint8_t *iov_frag_buf = iov[j].iov_base; + int iov_frag_size = iov[j].iov_len; + + totlen += iov_frag_size; + + /* + * Split each 
iovec fragment over more netmap slots, if + * necessary. + */ + for (;;) { + int copylen; + + copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; + memcpy(nm_buf, iov_frag_buf, copylen); + + iov_frag_buf += copylen; + iov_frag_size -= copylen; + nm_buf += copylen; + nm_buf_size -= copylen; + nm_buf_len += copylen; + + if (iov_frag_size == 0) { + break; + } + + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = NS_MOREFRAG; + head = nm_ring_next(ring, head); + if (head == ring->tail) { + /* + * We ran out of netmap slots while + * splitting the iovec fragments. + */ + EPRINTLN("No space, drop %zu bytes", + count_iov(iov, iovcnt)); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + } + } + + /* Complete the last slot, which must not have NS_MOREFRAG set. */ + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = 0; + head = nm_ring_next(ring, head); + + /* Now update ring->head and ring->cur. 
*/ + ring->head = ring->cur = head; +txsync: + ioctl(be->fd, NIOCTXSYNC, NULL); + + return (totlen); +} + +static ssize_t +netmap_peek_recvlen(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_ring *ring = priv->rx; + uint32_t head = ring->head; + ssize_t totlen = 0; + + while (head != ring->tail) { + struct netmap_slot *slot = ring->slot + head; + + totlen += slot->len; + if ((slot->flags & NS_MOREFRAG) == 0) + break; + head = nm_ring_next(ring, head); + } + + return (totlen); +} + +static ssize_t +netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + struct netmap_slot *slot = NULL; + struct netmap_ring *ring; + uint8_t *iov_frag_buf; + int iov_frag_size; + ssize_t totlen = 0; + uint32_t head; + + assert(iovcnt); + + ring = priv->rx; + head = ring->head; + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + + do { + uint8_t *nm_buf; + int nm_buf_len; + + if (head == ring->tail) { + return (0); + } + + slot = ring->slot + head; + nm_buf = NETMAP_BUF(ring, slot->buf_idx); + nm_buf_len = slot->len; + + for (;;) { + int copylen = nm_buf_len < iov_frag_size ? + nm_buf_len : iov_frag_size; + + memcpy(iov_frag_buf, nm_buf, copylen); + nm_buf += copylen; + nm_buf_len -= copylen; + iov_frag_buf += copylen; + iov_frag_size -= copylen; + totlen += copylen; + + if (nm_buf_len == 0) { + break; + } + + iov++; + iovcnt--; + if (iovcnt == 0) { + /* No space to receive. */ + EPRINTLN("Short iov, drop %zd bytes", + totlen); + return (-ENOSPC); + } + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + } + + head = nm_ring_next(ring, head); + + } while (slot->flags & NS_MOREFRAG); + + /* Release slots to netmap. 
*/ + ring->head = ring->cur = head; + + return (totlen); +} + +static void +netmap_recv_enable(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + mevent_enable(priv->mevp); +} + +static void +netmap_recv_disable(struct net_backend *be) +{ + struct netmap_priv *priv = NET_BE_PRIV(be); + + mevent_disable(priv->mevp); +} + +static struct net_backend netmap_backend = { + .prefix = "netmap", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +/* A clone of the netmap backend, with a different prefix. */ +static struct net_backend vale_backend = { + .prefix = "vale", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +DATA_SET(net_backend_set, netmap_backend); +DATA_SET(net_backend_set, vale_backend); diff --git a/usr.sbin/bhyve/net_backends.c b/usr.sbin/bhyve/net_backends.c index de6afab53854..2d11c45f217a 100644 --- a/usr.sbin/bhyve/net_backends.c +++ b/usr.sbin/bhyve/net_backends.c @@ -1,1106 +1,485 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements multiple network backends (tap, netmap, ...), * to be used by network frontends such as virtio-net and e1000. * The API to access the backend (e.g. send/receive packets, negotiate * features) is exported by net_backends.h. */ -#include -#include /* u_short etc */ +#include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include -#include -#include -#define NETMAP_WITH_LIBS -#include +#include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include +#include +#include +#include #include #include #include #include -#include #include -#include -#include -#include -#include -#include - -#ifdef NETGRAPH -#include -#include -#include -#endif +#include #include "config.h" #include "debug.h" #include "iov.h" #include "mevent.h" #include "net_backends.h" +#include "net_backends_priv.h" #include "pci_emul.h" -#include - -/* - * Each network backend registers a set of function pointers that are - * used to implement the net backends API. 
- * This might need to be exposed if we implement backends in separate files. - */ -struct net_backend { - const char *prefix; /* prefix matching this backend */ - - /* - * Routines used to initialize and cleanup the resources needed - * by a backend. The cleanup function is used internally, - * and should not be called by the frontend. - */ - int (*init)(struct net_backend *be, const char *devname, - nvlist_t *nvl, net_be_rxeof_t cb, void *param); - void (*cleanup)(struct net_backend *be); - - /* - * Called to serve a guest transmit request. The scatter-gather - * vector provided by the caller has 'iovcnt' elements and contains - * the packet to send. - */ - ssize_t (*send)(struct net_backend *be, const struct iovec *iov, - int iovcnt); - - /* - * Get the length of the next packet that can be received from - * the backend. If no packets are currently available, this - * function returns 0. - */ - ssize_t (*peek_recvlen)(struct net_backend *be); - - /* - * Called to receive a packet from the backend. When the function - * returns a positive value 'len', the scatter-gather vector - * provided by the caller contains a packet with such length. - * The function returns 0 if the backend doesn't have a new packet to - * receive. - */ - ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, - int iovcnt); - - /* - * Ask the backend to enable or disable receive operation in the - * backend. On return from a disable operation, it is guaranteed - * that the receive callback won't be called until receive is - * enabled again. Note however that it is up to the caller to make - * sure that netbe_recv() is not currently being executed by another - * thread. - */ - void (*recv_enable)(struct net_backend *be); - void (*recv_disable)(struct net_backend *be); - - /* - * Ask the backend for the virtio-net features it is able to - * support. Possible features are TSO, UFO and checksum offloading - * in both rx and tx direction and for both IPv4 and IPv6. 
- */ - uint64_t (*get_cap)(struct net_backend *be); - - /* - * Tell the backend to enable/disable the specified virtio-net - * features (capabilities). - */ - int (*set_cap)(struct net_backend *be, uint64_t features, - unsigned int vnet_hdr_len); - - struct pci_vtnet_softc *sc; - int fd; - - /* - * Length of the virtio-net header used by the backend and the - * frontend, respectively. A zero value means that the header - * is not used. - */ - unsigned int be_vnet_hdr_len; - unsigned int fe_vnet_hdr_len; - - /* Size of backend-specific private data. */ - size_t priv_size; - - /* Backend-specific private data follows. */ -}; - -#define NET_BE_PRIV(be) ((void *)((be) + 1)) #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) -SET_DECLARE(net_backend_set, struct net_backend); - -#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) - -#define WPRINTF(params) PRINTLN params - -/* - * The tap backend - */ - -struct tap_priv { - struct mevent *mevp; - /* - * A bounce buffer that allows us to implement the peek_recvlen - * callback. In the future we may get the same information from - * the kevent data. 
- */ - char bbuf[1 << 16]; - ssize_t bbuflen; -}; - -static void +void tap_cleanup(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); if (priv->mevp) { mevent_delete(priv->mevp); } if (be->fd != -1) { close(be->fd); be->fd = -1; } } static int tap_init(struct net_backend *be, const char *devname, nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) { struct tap_priv *priv = NET_BE_PRIV(be); char tbuf[80]; int opt = 1, up = IFF_UP; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif if (cb == NULL) { - WPRINTF(("TAP backend requires non-NULL callback")); + EPRINTLN("TAP backend requires non-NULL callback"); return (-1); } strcpy(tbuf, "/dev/"); strlcat(tbuf, devname, sizeof(tbuf)); be->fd = open(tbuf, O_RDWR); if (be->fd == -1) { - WPRINTF(("open of tap device %s failed", tbuf)); + EPRINTLN("open of tap device %s failed", tbuf); goto error; } /* * Set non-blocking and register for read * notifications with the event loop */ if (ioctl(be->fd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed")); + EPRINTLN("tap device O_NONBLOCK failed"); goto error; } if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { - WPRINTF(("tap device link up failed")); + EPRINTLN("tap device link up failed"); goto error; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(be->fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif memset(priv->bbuf, 0, sizeof(priv->bbuf)); priv->bbuflen = 0; priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { - WPRINTF(("Could not register event")); + EPRINTLN("Could not register event"); goto error; } return (0); error: tap_cleanup(be); return (-1); } /* * Called to send a buffer chain out to the tap device */ -static ssize_t +ssize_t tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (writev(be->fd, iov, iovcnt)); } -static ssize_t +ssize_t tap_peek_recvlen(struct net_backend *be) { struct 
tap_priv *priv = NET_BE_PRIV(be); ssize_t ret; if (priv->bbuflen > 0) { /* * We already have a packet in the bounce buffer. * Just return its length. */ return priv->bbuflen; } /* * Read the next packet (if any) into the bounce buffer, so * that we get to know its length and we can return that * to the caller. */ ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); if (ret < 0 && errno == EWOULDBLOCK) { return (0); } if (ret > 0) priv->bbuflen = ret; return (ret); } -static ssize_t +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct tap_priv *priv = NET_BE_PRIV(be); ssize_t ret; if (priv->bbuflen > 0) { /* * A packet is available in the bounce buffer, so * we read it from there. */ ret = buf_to_iov(priv->bbuf, priv->bbuflen, iov, iovcnt, 0); /* Mark the bounce buffer as empty. */ priv->bbuflen = 0; return (ret); } ret = readv(be->fd, iov, iovcnt); if (ret < 0 && errno == EWOULDBLOCK) { return (0); } return (ret); } -static void +void tap_recv_enable(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); mevent_enable(priv->mevp); } -static void +void tap_recv_disable(struct net_backend *be) { struct tap_priv *priv = NET_BE_PRIV(be); mevent_disable(priv->mevp); } -static uint64_t +uint64_t tap_get_cap(struct net_backend *be __unused) { return (0); /* no capabilities for now */ } -static int +int tap_set_cap(struct net_backend *be __unused, uint64_t features, unsigned vnet_hdr_len) { return ((features || vnet_hdr_len) ? -1 : 0); } static struct net_backend tap_backend = { .prefix = "tap", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .peek_recvlen = tap_peek_recvlen, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; /* A clone of the tap backend, with a different prefix. 
*/ static struct net_backend vmnet_backend = { .prefix = "vmnet", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .peek_recvlen = tap_peek_recvlen, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; DATA_SET(net_backend_set, tap_backend); DATA_SET(net_backend_set, vmnet_backend); -#ifdef NETGRAPH - -/* - * Netgraph backend - */ - -#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) - -static int -ng_init(struct net_backend *be, const char *devname __unused, - nvlist_t *nvl, net_be_rxeof_t cb, void *param) -{ - struct tap_priv *p = NET_BE_PRIV(be); - struct ngm_connect ngc; - const char *value, *nodename; - int sbsz; - int ctrl_sock; - int flags; - unsigned long maxsbsz; - size_t msbsz; -#ifndef WITHOUT_CAPSICUM - cap_rights_t rights; -#endif - - if (cb == NULL) { - WPRINTF(("Netgraph backend requires non-NULL callback")); - return (-1); - } - - be->fd = -1; - - memset(&ngc, 0, sizeof(ngc)); - - value = get_config_value_node(nvl, "path"); - if (value == NULL) { - WPRINTF(("path must be provided")); - return (-1); - } - strncpy(ngc.path, value, NG_PATHSIZ - 1); - - value = get_config_value_node(nvl, "hook"); - if (value == NULL) - value = "vmlink"; - strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); - - value = get_config_value_node(nvl, "peerhook"); - if (value == NULL) { - WPRINTF(("peer hook must be provided")); - return (-1); - } - strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); - - nodename = get_config_value_node(nvl, "socket"); - if (NgMkSockNode(nodename, - &ctrl_sock, &be->fd) < 0) { - WPRINTF(("can't get Netgraph sockets")); - return (-1); - } - - if (NgSendMsg(ctrl_sock, ".", - NGM_GENERIC_COOKIE, - NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { - WPRINTF(("can't connect to node")); - close(ctrl_sock); - goto error; - } - - close(ctrl_sock); - - flags = fcntl(be->fd, F_GETFL); - - if (flags < 0) { - WPRINTF(("can't get socket flags")); - goto 
error; - } - - if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { - WPRINTF(("can't set O_NONBLOCK flag")); - goto error; - } - - /* - * The default ng_socket(4) buffer's size is too low. - * Calculate the minimum value between NG_SBUF_MAX_SIZE - * and kern.ipc.maxsockbuf. - */ - msbsz = sizeof(maxsbsz); - if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, - NULL, 0) < 0) { - WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); - goto error; - } - - /* - * We can't set the socket buffer size to kern.ipc.maxsockbuf value, - * as it takes into account the mbuf(9) overhead. - */ - maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); - - sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); - - if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, - sizeof(sbsz)) < 0) { - WPRINTF(("can't set TX buffer size")); - goto error; - } - - if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, - sizeof(sbsz)) < 0) { - WPRINTF(("can't set RX buffer size")); - goto error; - } - -#ifndef WITHOUT_CAPSICUM - cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); - if (caph_rights_limit(be->fd, &rights) == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - memset(p->bbuf, 0, sizeof(p->bbuf)); - p->bbuflen = 0; - - p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); - if (p->mevp == NULL) { - WPRINTF(("Could not register event")); - goto error; - } - - return (0); - -error: - tap_cleanup(be); - return (-1); -} - -static struct net_backend ng_backend = { - .prefix = "netgraph", - .priv_size = sizeof(struct tap_priv), - .init = ng_init, - .cleanup = tap_cleanup, - .send = tap_send, - .peek_recvlen = tap_peek_recvlen, - .recv = tap_recv, - .recv_enable = tap_recv_enable, - .recv_disable = tap_recv_disable, - .get_cap = tap_get_cap, - .set_cap = tap_set_cap, -}; - -DATA_SET(net_backend_set, ng_backend); - -#endif /* NETGRAPH */ - -/* - * The netmap backend - */ - -/* The virtio-net features supported by netmap. 
*/ -#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ - VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ - VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ - VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) - -struct netmap_priv { - char ifname[IFNAMSIZ]; - struct nm_desc *nmd; - uint16_t memid; - struct netmap_ring *rx; - struct netmap_ring *tx; - struct mevent *mevp; - net_be_rxeof_t cb; - void *cb_param; -}; - -static void -nmreq_init(struct nmreq *req, char *ifname) -{ - - memset(req, 0, sizeof(*req)); - strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); - req->nr_version = NETMAP_API; -} - -static int -netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) -{ - int err; - struct nmreq req; - struct netmap_priv *priv = NET_BE_PRIV(be); - - nmreq_init(&req, priv->ifname); - req.nr_cmd = NETMAP_BDG_VNET_HDR; - req.nr_arg1 = vnet_hdr_len; - err = ioctl(be->fd, NIOCREGIF, &req); - if (err) { - WPRINTF(("Unable to set vnet header length %d", - vnet_hdr_len)); - return (err); - } - - be->be_vnet_hdr_len = vnet_hdr_len; - - return (0); -} - -static int -netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) -{ - unsigned prev_hdr_len = be->be_vnet_hdr_len; - int ret; - - if (vnet_hdr_len == prev_hdr_len) { - return (1); - } - - ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); - if (ret) { - return (0); - } - - netmap_set_vnet_hdr_len(be, prev_hdr_len); - - return (1); -} - -static uint64_t -netmap_get_cap(struct net_backend *be) -{ - - return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? 
- NETMAP_FEATURES : 0); -} - -static int -netmap_set_cap(struct net_backend *be, uint64_t features __unused, - unsigned vnet_hdr_len) -{ - - return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); -} - -static int -netmap_init(struct net_backend *be, const char *devname, - nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - strlcpy(priv->ifname, devname, sizeof(priv->ifname)); - priv->ifname[sizeof(priv->ifname) - 1] = '\0'; - - priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); - if (priv->nmd == NULL) { - WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)", - devname, strerror(errno))); - return (-1); - } - - priv->memid = priv->nmd->req.nr_arg2; - priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); - priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); - priv->cb = cb; - priv->cb_param = param; - be->fd = priv->nmd->fd; - - priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); - if (priv->mevp == NULL) { - WPRINTF(("Could not register event")); - return (-1); - } - - return (0); -} - -static void -netmap_cleanup(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - if (priv->mevp) { - mevent_delete(priv->mevp); - } - if (priv->nmd) { - nm_close(priv->nmd); - } - be->fd = -1; -} - -static ssize_t -netmap_send(struct net_backend *be, const struct iovec *iov, - int iovcnt) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_ring *ring; - ssize_t totlen = 0; - int nm_buf_size; - int nm_buf_len; - uint32_t head; - uint8_t *nm_buf; - int j; - - ring = priv->tx; - head = ring->head; - if (head == ring->tail) { - WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); - goto txsync; - } - nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); - nm_buf_size = ring->nr_buf_size; - nm_buf_len = 0; - - for (j = 0; j < iovcnt; j++) { - uint8_t *iov_frag_buf = iov[j].iov_base; - int iov_frag_size = iov[j].iov_len; - - totlen += iov_frag_size; - - /* - * Split each 
iovec fragment over more netmap slots, if - * necessary. - */ - for (;;) { - int copylen; - - copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; - memcpy(nm_buf, iov_frag_buf, copylen); - - iov_frag_buf += copylen; - iov_frag_size -= copylen; - nm_buf += copylen; - nm_buf_size -= copylen; - nm_buf_len += copylen; - - if (iov_frag_size == 0) { - break; - } - - ring->slot[head].len = nm_buf_len; - ring->slot[head].flags = NS_MOREFRAG; - head = nm_ring_next(ring, head); - if (head == ring->tail) { - /* - * We ran out of netmap slots while - * splitting the iovec fragments. - */ - WPRINTF(("No space, drop %zu bytes", - count_iov(iov, iovcnt))); - goto txsync; - } - nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); - nm_buf_size = ring->nr_buf_size; - nm_buf_len = 0; - } - } - - /* Complete the last slot, which must not have NS_MOREFRAG set. */ - ring->slot[head].len = nm_buf_len; - ring->slot[head].flags = 0; - head = nm_ring_next(ring, head); - - /* Now update ring->head and ring->cur. 
*/ - ring->head = ring->cur = head; -txsync: - ioctl(be->fd, NIOCTXSYNC, NULL); - - return (totlen); -} - -static ssize_t -netmap_peek_recvlen(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_ring *ring = priv->rx; - uint32_t head = ring->head; - ssize_t totlen = 0; - - while (head != ring->tail) { - struct netmap_slot *slot = ring->slot + head; - - totlen += slot->len; - if ((slot->flags & NS_MOREFRAG) == 0) - break; - head = nm_ring_next(ring, head); - } - - return (totlen); -} - -static ssize_t -netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - struct netmap_slot *slot = NULL; - struct netmap_ring *ring; - uint8_t *iov_frag_buf; - int iov_frag_size; - ssize_t totlen = 0; - uint32_t head; - - assert(iovcnt); - - ring = priv->rx; - head = ring->head; - iov_frag_buf = iov->iov_base; - iov_frag_size = iov->iov_len; - - do { - uint8_t *nm_buf; - int nm_buf_len; - - if (head == ring->tail) { - return (0); - } - - slot = ring->slot + head; - nm_buf = NETMAP_BUF(ring, slot->buf_idx); - nm_buf_len = slot->len; - - for (;;) { - int copylen = nm_buf_len < iov_frag_size ? - nm_buf_len : iov_frag_size; - - memcpy(iov_frag_buf, nm_buf, copylen); - nm_buf += copylen; - nm_buf_len -= copylen; - iov_frag_buf += copylen; - iov_frag_size -= copylen; - totlen += copylen; - - if (nm_buf_len == 0) { - break; - } - - iov++; - iovcnt--; - if (iovcnt == 0) { - /* No space to receive. */ - WPRINTF(("Short iov, drop %zd bytes", - totlen)); - return (-ENOSPC); - } - iov_frag_buf = iov->iov_base; - iov_frag_size = iov->iov_len; - } - - head = nm_ring_next(ring, head); - - } while (slot->flags & NS_MOREFRAG); - - /* Release slots to netmap. 
*/ - ring->head = ring->cur = head; - - return (totlen); -} - -static void -netmap_recv_enable(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - mevent_enable(priv->mevp); -} - -static void -netmap_recv_disable(struct net_backend *be) -{ - struct netmap_priv *priv = NET_BE_PRIV(be); - - mevent_disable(priv->mevp); -} - -static struct net_backend netmap_backend = { - .prefix = "netmap", - .priv_size = sizeof(struct netmap_priv), - .init = netmap_init, - .cleanup = netmap_cleanup, - .send = netmap_send, - .peek_recvlen = netmap_peek_recvlen, - .recv = netmap_recv, - .recv_enable = netmap_recv_enable, - .recv_disable = netmap_recv_disable, - .get_cap = netmap_get_cap, - .set_cap = netmap_set_cap, -}; - -/* A clone of the netmap backend, with a different prefix. */ -static struct net_backend vale_backend = { - .prefix = "vale", - .priv_size = sizeof(struct netmap_priv), - .init = netmap_init, - .cleanup = netmap_cleanup, - .send = netmap_send, - .peek_recvlen = netmap_peek_recvlen, - .recv = netmap_recv, - .recv_enable = netmap_recv_enable, - .recv_disable = netmap_recv_disable, - .get_cap = netmap_get_cap, - .set_cap = netmap_set_cap, -}; - -DATA_SET(net_backend_set, netmap_backend); -DATA_SET(net_backend_set, vale_backend); - int netbe_legacy_config(nvlist_t *nvl, const char *opts) { char *backend, *cp; if (opts == NULL) return (0); cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "backend", opts); return (0); } backend = strndup(opts, cp - opts); set_config_value_node(nvl, "backend", backend); free(backend); return (pci_parse_legacy_config(nvl, cp + 1)); } /* * Initialize a backend and attach to the frontend. * This is called during frontend initialization. * @ret is a pointer to the backend to be initialized * @devname is the backend-name as supplied on the command line, * e.g. 
-s 2:0,frontend-name,backend-name[,other-args] * @cb is the receive callback supplied by the frontend, * and it is invoked in the event loop when a receive * event is generated in the hypervisor, * @param is a pointer to the frontend, and normally used as * the argument for the callback. */ int netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, void *param) { struct net_backend **pbe, *nbe, *tbe = NULL; const char *value, *type; char *devname; int err; value = get_config_value_node(nvl, "backend"); if (value == NULL) { return (-1); } devname = strdup(value); /* * Use the type given by configuration if exists; otherwise * use the prefix of the backend as the type. */ type = get_config_value_node(nvl, "type"); if (type == NULL) type = devname; /* * Find the network backend that matches the user-provided * device name. net_backend_set is built using a linker set. */ SET_FOREACH(pbe, net_backend_set) { if (strncmp(type, (*pbe)->prefix, strlen((*pbe)->prefix)) == 0) { tbe = *pbe; assert(tbe->init != NULL); assert(tbe->cleanup != NULL); assert(tbe->send != NULL); assert(tbe->recv != NULL); assert(tbe->get_cap != NULL); assert(tbe->set_cap != NULL); break; } } *ret = NULL; if (tbe == NULL) { free(devname); return (EINVAL); } nbe = calloc(1, NET_BE_SIZE(tbe)); *nbe = *tbe; /* copy the template */ nbe->fd = -1; nbe->sc = param; nbe->be_vnet_hdr_len = 0; nbe->fe_vnet_hdr_len = 0; /* Initialize the backend. */ err = nbe->init(nbe, devname, nvl, cb, param); if (err) { free(devname); free(nbe); return (err); } *ret = nbe; free(devname); return (0); } void netbe_cleanup(struct net_backend *be) { if (be != NULL) { be->cleanup(be); free(be); } } uint64_t netbe_get_cap(struct net_backend *be) { assert(be != NULL); return (be->get_cap(be)); } int netbe_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { int ret; assert(be != NULL); /* There are only three valid lengths, i.e., 0, 10 and 12. 
*/ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) return (-1); be->fe_vnet_hdr_len = vnet_hdr_len; ret = be->set_cap(be, features, vnet_hdr_len); assert(be->be_vnet_hdr_len == 0 || be->be_vnet_hdr_len == be->fe_vnet_hdr_len); return (ret); } ssize_t netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (be->send(be, iov, iovcnt)); } ssize_t netbe_peek_recvlen(struct net_backend *be) { return (be->peek_recvlen(be)); } /* * Try to read a packet from the backend, without blocking. * If no packets are available, return 0. In case of success, return * the length of the packet just read. Return -1 in case of errors. */ ssize_t netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (be->recv(be, iov, iovcnt)); } /* * Read a packet from the backend and discard it. * Returns the size of the discarded packet or zero if no packet was available. * A negative error code is returned in case of read error. */ ssize_t netbe_rx_discard(struct net_backend *be) { /* * MP note: the dummybuf is only used to discard frames, * so there is no need for it to be per-vtnet or locked. * We only make it large enough for TSO-sized segment. 
*/ static uint8_t dummybuf[65536 + 64]; struct iovec iov; iov.iov_base = dummybuf; iov.iov_len = sizeof(dummybuf); return netbe_recv(be, &iov, 1); } void netbe_rx_disable(struct net_backend *be) { return be->recv_disable(be); } void netbe_rx_enable(struct net_backend *be) { return be->recv_enable(be); } size_t netbe_get_vnet_hdr_len(struct net_backend *be) { return (be->be_vnet_hdr_len); } diff --git a/usr.sbin/bhyve/net_backends.h b/usr.sbin/bhyve/net_backends.h index 6d58e04b3470..e623b7c64637 100644 --- a/usr.sbin/bhyve/net_backends.h +++ b/usr.sbin/bhyve/net_backends.h @@ -1,94 +1,99 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

 #ifndef __NET_BACKENDS_H__
 #define __NET_BACKENDS_H__

-#include
+#include
+#include
+#include
+
+#include "mevent.h"

 /* Opaque type representing a network backend. */
 typedef struct net_backend net_backend_t;

 /* Interface between network frontends and the network backends. */

 /*
  * Receive callback supplied by the frontend to netbe_init(); the last
  * argument is the opaque 'param' pointer given at that time.
  */
 typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param);

 /*
  * Backend setup and teardown.  netbe_init() selects a registered backend
  * from the "backend"/"type" values in nvl and stores the new instance in
  * *be (0 on success, non-zero on failure).  netbe_legacy_config() turns a
  * legacy "backend[,other-args]" string into values on nvl.
  * netbe_cleanup() runs the backend's cleanup routine and frees the
  * instance; a NULL backend is ignored.
  */
 int	netbe_init(net_backend_t **be, nvlist_t *nvl, net_be_rxeof_t cb,
             void *param);
 int	netbe_legacy_config(nvlist_t *nvl, const char *opts);
 void	netbe_cleanup(net_backend_t *be);

 /*
  * Capability negotiation.  netbe_set_cap() returns -1 when vnet_hdr_len
  * is not 0, VNET_HDR_LEN or VNET_HDR_LEN - sizeof(uint16_t).
  * netbe_get_vnet_hdr_len() reports the header length in use by the
  * backend.
  */
 uint64_t netbe_get_cap(net_backend_t *be);
 int	 netbe_set_cap(net_backend_t *be, uint64_t cap,
              unsigned vnet_hdr_len);
 size_t	netbe_get_vnet_hdr_len(net_backend_t *be);

 /*
  * Data path.  Each call is forwarded to the corresponding routine of the
  * backend; netbe_rx_discard() receives one packet into a scratch buffer
  * and drops it.
  */
 ssize_t	netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t	netbe_peek_recvlen(net_backend_t *be);
 ssize_t	netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t	netbe_rx_discard(net_backend_t *be);

 /* Forwarded to the backend's recv_disable / recv_enable routines. */
 void	netbe_rx_disable(net_backend_t *be);
 void	netbe_rx_enable(net_backend_t *be);

 /*
  * Network device capabilities taken from the VirtIO standard.
  * Despite the name, these capabilities can be used by different frontends
  * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
*/
 #define	VIRTIO_NET_F_CSUM	(1 << 0) /* host handles partial cksum */
 #define	VIRTIO_NET_F_GUEST_CSUM	(1 << 1) /* guest handles partial cksum */
 #define	VIRTIO_NET_F_MTU	(1 << 3) /* initial MTU advice */
 #define	VIRTIO_NET_F_MAC	(1 << 5) /* host supplies MAC */
 #define	VIRTIO_NET_F_GSO_DEPREC	(1 << 6) /* deprecated: host handles GSO */
 #define	VIRTIO_NET_F_GUEST_TSO4	(1 << 7) /* guest can rcv TSOv4 */
 #define	VIRTIO_NET_F_GUEST_TSO6	(1 << 8) /* guest can rcv TSOv6 */
 #define	VIRTIO_NET_F_GUEST_ECN	(1 << 9) /* guest can rcv TSO with ECN */
 #define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
 #define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
 #define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
 #define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
 #define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
 #define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
 #define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
 #define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
 #define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
 #define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
 #define	VIRTIO_NET_F_GUEST_ANNOUNCE \
 				(1 << 21) /* guest can send gratuitous pkts */
 #define	VIRTIO_NET_F_MQ		(1 << 22) /* host supports multiple VQ pairs */

 /*
  * Fixed network header size
  *
  * The structure is packed: two uint8_t fields followed by five uint16_t
  * fields, 12 bytes in total.
  */
 struct virtio_net_rxhdr {
 	uint8_t		vrh_flags;
 	uint8_t		vrh_gso_type;
 	uint16_t	vrh_hdr_len;
 	uint16_t	vrh_gso_size;
 	uint16_t	vrh_csum_start;
 	uint16_t	vrh_csum_offset;
 	uint16_t	vrh_bufs;
 } __packed;

+/* Length in bytes of the full header described by struct virtio_net_rxhdr. */
+#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

 #endif /* __NET_BACKENDS_H__ */
diff --git a/usr.sbin/bhyve/net_backends_priv.h b/usr.sbin/bhyve/net_backends_priv.h
new file mode 100644
index 000000000000..4b3f4e4358a3
--- /dev/null
+++ b/usr.sbin/bhyve/net_backends_priv.h
@@ -0,0 +1,152 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright
(c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NET_BACKENDS_PRIV_H__ +#define __NET_BACKENDS_PRIV_H__ + +#include + +/* + * Each network backend registers a set of function pointers that are + * used to implement the net backends API. Frontends should not invoke + * these functions directly, but should instead use the interface provided by + * net_backends.h. + */ +struct net_backend { + const char *prefix; /* prefix matching this backend */ + + /* + * Routines used to initialize and cleanup the resources needed + * by a backend. The cleanup function is used internally, + * and should not be called by the frontend. 
+ */ + int (*init)(struct net_backend *be, const char *devname, + nvlist_t *nvl, net_be_rxeof_t cb, void *param); + void (*cleanup)(struct net_backend *be); + + /* + * Called to serve a guest transmit request. The scatter-gather + * vector provided by the caller has 'iovcnt' elements and contains + * the packet to send. + */ + ssize_t (*send)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Get the length of the next packet that can be received from + * the backend. If no packets are currently available, this + * function returns 0. + */ + ssize_t (*peek_recvlen)(struct net_backend *be); + + /* + * Called to receive a packet from the backend. When the function + * returns a positive value 'len', the scatter-gather vector + * provided by the caller contains a packet with such length. + * The function returns 0 if the backend doesn't have a new packet to + * receive. + */ + ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Ask the backend to enable or disable receive operation in the + * backend. On return from a disable operation, it is guaranteed + * that the receive callback won't be called until receive is + * enabled again. Note however that it is up to the caller to make + * sure that netbe_recv() is not currently being executed by another + * thread. + */ + void (*recv_enable)(struct net_backend *be); + void (*recv_disable)(struct net_backend *be); + + /* + * Ask the backend for the virtio-net features it is able to + * support. Possible features are TSO, UFO and checksum offloading + * in both rx and tx direction and for both IPv4 and IPv6. + */ + uint64_t (*get_cap)(struct net_backend *be); + + /* + * Tell the backend to enable/disable the specified virtio-net + * features (capabilities). 
+ */ + int (*set_cap)(struct net_backend *be, uint64_t features, + unsigned int vnet_hdr_len); + + struct pci_vtnet_softc *sc; + int fd; + + /* + * Length of the virtio-net header used by the backend and the + * frontend, respectively. A zero value means that the header + * is not used. + */ + unsigned int be_vnet_hdr_len; + unsigned int fe_vnet_hdr_len; + + /* Size of backend-specific private data. */ + size_t priv_size; + + /* Backend-specific private data follows. */ +}; + +#define NET_BE_PRIV(be) ((void *)((be) + 1)) + +SET_DECLARE(net_backend_set, struct net_backend); + +#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) + +/* + * Export the tap backend routines for the benefit of other backends which have + * a similar interface to the kernel, i.e., they send and receive data using + * standard I/O system calls with a single file descriptor. + */ + +struct tap_priv { + struct mevent *mevp; + /* + * A bounce buffer that allows us to implement the peek_recvlen + * callback. In the future we may get the same information from + * the kevent data. + */ + char bbuf[1 << 16]; + ssize_t bbuflen; +}; + +void tap_cleanup(struct net_backend *be); +ssize_t tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt); +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt); +ssize_t tap_peek_recvlen(struct net_backend *be); +void tap_recv_enable(struct net_backend *be); +ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt); +void tap_recv_disable(struct net_backend *be); +uint64_t tap_get_cap(struct net_backend *be); +int tap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len); + +#endif /* !__NET_BACKENDS_PRIV_H__ */