Index: usr.sbin/bhyve/Makefile
===================================================================
--- usr.sbin/bhyve/Makefile
+++ usr.sbin/bhyve/Makefile
@@ -32,6 +32,7 @@
 	mem.c			\
 	mevent.c		\
 	mptbl.c			\
+	net_backends.c		\
 	net_utils.c		\
 	pci_ahci.c		\
 	pci_e82545.c		\
Index: usr.sbin/bhyve/net_backends.h
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.h
@@ -0,0 +1,109 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include <stdint.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/netmap.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
+#include "mevent.h"
+
+extern int netmap_ioctl_counter;
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_backend_cb_t)(int, enum ev_type, void *param);
+net_backend_t *netbe_init(const char *devname,
+	net_backend_cb_t cb, void *param);
+void netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int netbe_set_cap(net_backend_t *be, uint64_t cap,
+	unsigned vnet_hdr_len);
+void netbe_send(net_backend_t *be, struct iovec *iov,
+	int iovcnt, uint32_t len, int more);
+int netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+int netbe_rx_discard(net_backend_t *be);
+
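The sketch below shows how a frontend is expected to drive the API above. The callback and softc names are hypothetical and error handling is elided; compare with the pci_virtio_net.c changes at the end of this patch.

    /* Hypothetical frontend glue for the netbe_* API declared above. */
    static void
    my_rx_callback(int fd, enum ev_type type, void *param)
    {
            struct my_softc *sc = param;    /* the 'param' cookie */
            struct iovec iov[8];            /* guest rx buffers */
            int len;

            /* Fill iov[] from the guest rx ring, then: */
            len = netbe_recv(sc->be, iov, 8);
            /* len > 0: packet size; 0: no packet; < 0: backend error. */
    }

    /* At device init: */
    sc->be = netbe_init("tap0", my_rx_callback, sc);
    caps = netbe_get_cap(sc->be);           /* offer these to the guest */
    /* After feature negotiation: */
    netbe_set_cap(sc->be, negotiated_features, vnet_hdr_len);
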
+/*
+ * Network device capabilities taken from the VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontends
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
+ */
+#define VIRTIO_NET_F_CSUM	(1 << 0)  /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM	(1 << 1)  /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC	(1 << 5)  /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC	(1 << 6)  /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4	(1 << 7)  /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6	(1 << 8)  /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN	(1 << 9)  /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+			(1 << 21) /* guest can send gratuitous pkts */
+
+/*
+ * Fixed network header size.
+ */
+struct virtio_net_rxhdr {
+	uint8_t		vrh_flags;
+	uint8_t		vrh_gso_type;
+	uint16_t	vrh_hdr_len;
+	uint16_t	vrh_gso_size;
+	uint16_t	vrh_csum_start;
+	uint16_t	vrh_csum_offset;
+	uint16_t	vrh_bufs;
+} __packed;
+
+/* Used to get read-only info about a netmap interface. */
+struct netmap_if_info {
+	uint32_t nifp_offset;
+	uint16_t num_tx_rings;
+	uint16_t num_rx_rings;
+	uint16_t num_tx_slots;
+	uint16_t num_rx_slots;
+};
+
+#include "pci_emul.h"
+int net_parsemac(char *mac_str, uint8_t *mac_addr);
+void net_genmac(struct pci_devinst *pi, uint8_t *macaddr);
+
+#endif /* __NET_BACKENDS_H__ */
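For reference, struct virtio_net_rxhdr is the standard 12-byte virtio-net header; when VIRTIO_NET_F_MRG_RXBUF is not negotiated the header shrinks to 10 bytes, because the vrh_bufs field is then absent (see netbe_set_cap() below). A compile-time check in this spirit could pin the layout down (illustrative sketch, not part of the patch):

    #include <assert.h>     /* C11 static_assert */

    /* net_backends.c defines VNET_HDR_LEN as sizeof(struct virtio_net_rxhdr). */
    static_assert(sizeof(struct virtio_net_rxhdr) == 12,
        "virtio-net rx header must be 12 bytes");
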
Index: usr.sbin/bhyve/net_backends.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/net_backends.c
@@ -0,0 +1,889 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements multiple network backends (tap, netmap, ...),
+ * to be used by network frontends such as virtio-net and ptnet.
+ * The API to access the backend (e.g. send/receive packets, negotiate
+ * features) is exported by net_backends.h.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+#include <sys/types.h>		/* u_short etc */
+#include <net/if.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#include <capsicum_helpers.h>
+#endif
+
+#include "mevent.h"
+#include "net_backends.h"
+
+#include <sys/linker_set.h>
+
+/*
+ * Each network backend registers a set of function pointers that are
+ * used to implement the net backends API.
+ * This might need to be exposed if we implement backends in separate files.
+ */
+struct net_backend {
+	const char *name;	/* name of the backend */
+	/*
+	 * The init and cleanup functions are used internally;
+	 * frontends such as virtio-net should never call them directly.
+	 */
+	int (*init)(struct net_backend *be, const char *devname,
+	    net_backend_cb_t cb, void *param);
+	void (*cleanup)(struct net_backend *be);
+
+	/*
+	 * Called to serve a guest transmit request. The scatter-gather
+	 * vector provided by the caller has 'iovcnt' elements and contains
+	 * the packet to send. 'len' is the length of the whole packet in
+	 * bytes.
+	 */
+	/* TODO ssize_t */
+	int (*send)(struct net_backend *be, struct iovec *iov,
+	    int iovcnt, uint32_t len, int more);
+
+	/*
+	 * Called to serve a guest receive request. When the function
+	 * returns a positive value, the scatter-gather vector
+	 * provided by the caller (having 'iovcnt' elements in it) will
+	 * contain a chunk of the received packet. The 'more' flag will
+	 * be set if the returned chunk is not the last one for the current
+	 * packet, and 0 otherwise. The function returns the chunk size
+	 * in bytes, or 0 if the backend doesn't have a new packet to
+	 * receive.
+	 * Note that it may be necessary to call this callback many
+	 * times to receive a single packet, depending on how big the
+	 * buffers you provide are.
+	 */
+	/* TODO ssize_t */
+	int (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+	/*
+	 * Ask the backend for the virtio-net features it is able to
+	 * support. Possible features are TSO, UFO and checksum offloading
+	 * in both rx and tx directions, and for both IPv4 and IPv6.
+	 */
+	uint64_t (*get_cap)(struct net_backend *be);
+
+	/*
+	 * Tell the backend to enable/disable the specified virtio-net
+	 * features (capabilities).
+	 */
+	int (*set_cap)(struct net_backend *be, uint64_t features,
+	    unsigned int vnet_hdr_len);
+
+	struct pci_vtnet_softc *sc;
+	int fd;
+	unsigned int be_vnet_hdr_len;
+	unsigned int fe_vnet_hdr_len;
+	/* TODO: implement priv space as a 'char opaque[0]' */
+	void *priv;	/* Pointer to backend-specific data. */
+};
+
+SET_DECLARE(net_backend_set, struct net_backend);
+
+#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
+
+#define WPRINTF(params) printf params
+
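Backends register themselves into the net_backend_set linker set declared above, so netbe_init() can discover them without a central table. A hypothetical extra backend would only need a template and a DATA_SET() entry (sketch; the "null" backend is illustrative only, and would still need send/recv/get_cap/set_cap methods, since netbe_fix() below rejects templates with missing fields):

    static int
    null_init(struct net_backend *be, const char *devname,
        net_backend_cb_t cb, void *param)
    {
            be->fd = -1;    /* no file descriptor to poll */
            return 0;
    }

    static void
    null_cleanup(struct net_backend *be)
    {
    }

    static struct net_backend null_backend = {
            .name = "null",
            .init = null_init,
            .cleanup = null_cleanup,
            /* .send/.recv/.get_cap/.set_cap omitted for brevity */
    };

    DATA_SET(net_backend_set, null_backend);
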
+/* The tap backend */
+
+struct tap_priv {
+	struct mevent *mevp;
+};
+
+static void
+tap_cleanup(struct net_backend *be)
+{
+	struct tap_priv *priv = be->priv;
+
+	if (be->priv) {
+		mevent_delete(priv->mevp);
+		free(be->priv);
+		be->priv = NULL;
+	}
+	if (be->fd != -1) {
+		close(be->fd);
+		be->fd = -1;
+	}
+}
+
+static int
+tap_init(struct net_backend *be, const char *devname,
+	 net_backend_cb_t cb, void *param)
+{
+	char tbuf[80];
+	int fd;
+	int opt = 1;
+	struct tap_priv *priv;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	if (cb == NULL) {
+		WPRINTF(("TAP backend requires non-NULL callback\n"));
+		return -1;
+	}
+
+	priv = calloc(1, sizeof(struct tap_priv));
+	if (priv == NULL) {
+		WPRINTF(("tap_priv alloc failed\n"));
+		return -1;
+	}
+
+	strcpy(tbuf, "/dev/");
+	strlcat(tbuf, devname, sizeof(tbuf));
+
+	fd = open(tbuf, O_RDWR);
+	if (fd == -1) {
+		WPRINTF(("open of tap device %s failed\n", tbuf));
+		goto error;
+	}
+
+	/*
+	 * Set non-blocking and register for read
+	 * notifications with the event loop.
+	 */
+	if (ioctl(fd, FIONBIO, &opt) < 0) {
+		WPRINTF(("tap device O_NONBLOCK failed\n"));
+		goto error;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(fd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	priv->mevp = mevent_add(fd, EVF_READ, cb, param);
+	if (priv->mevp == NULL) {
+		WPRINTF(("Could not register event\n"));
+		goto error;
+	}
+
+	be->fd = fd;
+	be->priv = priv;
+
+	return 0;
+
+error:
+	/* 'fd' and 'priv' are not yet attached to 'be': clean up directly. */
+	if (fd != -1)
+		close(fd);
+	free(priv);
+	return -1;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device.
+ */
+static int
+tap_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+	 int more)
+{
+	static char pad[60]; /* all zero bytes */
+
+	/*
+	 * If the length is < 60 (the minimum Ethernet frame size, not
+	 * counting the FCS), pad out to that and add the extra zero'd
+	 * segment to the iov. It is guaranteed that there is always an
+	 * extra iov available by the caller.
+	 */
+	if (len < 60) {
+		iov[iovcnt].iov_base = pad;
+		iov[iovcnt].iov_len = (size_t)(60 - len);
+		iovcnt++;
+	}
+
+	return (int)writev(be->fd, iov, iovcnt);
+}
+
+static int
+tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	int ret;
+
+	/* Should never be called without a valid tap fd */
+	assert(be->fd != -1);
+
+	ret = (int)readv(be->fd, iov, iovcnt);
+
+	if (ret < 0 && errno == EWOULDBLOCK) {
+		return 0;
+	}
+
+	return ret;
+}
+
+static uint64_t
+tap_get_cap(struct net_backend *be)
+{
+	return 0; /* no capabilities for now */
+}
+
+static int
+tap_set_cap(struct net_backend *be, uint64_t features,
+	    unsigned vnet_hdr_len)
+{
+	return (features || vnet_hdr_len) ? -1 : 0;
+}
+
+static struct net_backend tap_backend = {
+	.name = "tap|vmnet",
+	.init = tap_init,
+	.cleanup = tap_cleanup,
+	.send = tap_send,
+	.recv = tap_recv,
+	.get_cap = tap_get_cap,
+	.set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+#define NETMAP_POLLMASK (POLLIN | POLLRDNORM | POLLRDBAND)
+
+struct netmap_priv {
+	char ifname[IFNAMSIZ];
+	struct nm_desc *nmd;
+	uint16_t memid;
+	struct netmap_ring *rx;
+	struct netmap_ring *tx;
+	pthread_t evloop_tid;
+	net_backend_cb_t cb;
+	void *cb_param;
+};
+
+static void *
+netmap_evloop_thread(void *param)
+{
+	struct net_backend *be = param;
+	struct netmap_priv *priv = be->priv;
+	struct pollfd pfd;
+	int ret;
+
+	for (;;) {
+		pfd.fd = be->fd;
+		pfd.events = NETMAP_POLLMASK;
+		ret = poll(&pfd, 1, INFTIM);
+		if (ret == -1 && errno != EINTR) {
+			WPRINTF(("netmap poll failed, %d\n", errno));
+		} else if (ret == 1 && (pfd.revents & NETMAP_POLLMASK)) {
+			priv->cb(pfd.fd, EVF_READ, priv->cb_param);
+		}
+	}
+
+	return NULL;
+}
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+	memset(req, 0, sizeof(*req));
+	strncpy(req->nr_name, ifname, sizeof(req->nr_name));
+	req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+	int err;
+	struct nmreq req;
+	struct netmap_priv *priv = be->priv;
+
+	nmreq_init(&req, priv->ifname);
+	req.nr_cmd = NETMAP_BDG_VNET_HDR;
+	req.nr_arg1 = vnet_hdr_len;
+	err = ioctl(be->fd, NIOCREGIF, &req);
+	if (err) {
+		WPRINTF(("Unable to set vnet header length %d\n",
+		    vnet_hdr_len));
+		return err;
+	}
+
+	be->be_vnet_hdr_len = vnet_hdr_len;
+
+	return 0;
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+	int prev_hdr_len = be->be_vnet_hdr_len;
+	int ret;
+
+	if (vnet_hdr_len == prev_hdr_len) {
+		return 1;
+	}
+
+	/*
+	 * Probe by temporarily setting the new length, and restore the
+	 * previous one if the probe succeeds.
+	 */
+	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+	if (ret) {
+		return 0;
+	}
+
+	netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+	return 1;
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+	return netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
+	    NETMAP_FEATURES : 0;
+}
+
+static int
+netmap_set_cap(struct net_backend *be, uint64_t features,
+	       unsigned vnet_hdr_len)
+{
+	return netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+}
+
+static int
+netmap_init(struct net_backend *be, const char *devname,
+	    net_backend_cb_t cb, void *param)
+{
+	struct netmap_priv *priv = NULL;
+
+	priv = calloc(1, sizeof(struct netmap_priv));
+	if (priv == NULL) {
+		WPRINTF(("Unable to alloc netmap private data\n"));
+		return -1;
+	}
+
+	strncpy(priv->ifname, devname, sizeof(priv->ifname));
+	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
+
+	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
+	if (priv->nmd == NULL) {
+		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
+		    devname, strerror(errno)));
+		free(priv);
+		return -1;
+	}
+
+	priv->memid = priv->nmd->req.nr_arg2;
+	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
+	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
+	priv->cb = cb;
+	priv->cb_param = param;
+	be->fd = priv->nmd->fd;
+	be->priv = priv;
+
+	/* TODO Turn this into a mevent_add */
+	{
+		char tname[40];
+
+		/* Create a thread for netmap poll. */
+		pthread_create(&priv->evloop_tid, NULL, netmap_evloop_thread,
+		    (void *)be);
+		snprintf(tname, sizeof(tname), "netmap-evloop-%p", priv);
+		pthread_set_name_np(priv->evloop_tid, tname);
+	}
+
+	return 0;
+}
+
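How the two capability hooks are used end to end (sketch; this is the flow implemented by the pci_virtio_net.c changes at the bottom of this patch):

    /* At init: merge backend capabilities into the host capabilities. */
    vc->vc_hv_caps |= netbe_get_cap(sc->vsc_be);

    /*
     * After the guest acks its features: program the backend. The vnet
     * header is 12 bytes with VIRTIO_NET_F_MRG_RXBUF, 10 bytes otherwise.
     */
    hdrlen = (features & VIRTIO_NET_F_MRG_RXBUF) ? 12 : 10;
    if (netbe_set_cap(sc->vsc_be, features, hdrlen) < 0)
            /* the backend cannot honor the negotiated features */;
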
+static void
+netmap_cleanup(struct net_backend *be)
+{
+	struct netmap_priv *priv = be->priv;
+
+	if (be->priv) {
+		nm_close(priv->nmd);
+		free(be->priv);
+		be->priv = NULL;
+	}
+	be->fd = -1;
+}
+
+/* A fast copy routine only for multiples of 64 bytes, non-overlapping. */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+	const uint64_t *src = _src;
+	uint64_t *dst = _dst;
+
+	if (l >= 1024) {
+		bcopy(src, dst, l);
+		return;
+	}
+	for (; l > 0; l -= 64) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+}
+
+static int
+netmap_send(struct net_backend *be, struct iovec *iov,
+	    int iovcnt, uint32_t size, int more)
+{
+	struct netmap_priv *priv = be->priv;
+	struct netmap_ring *ring;
+	int nm_buf_size;
+	int nm_buf_len;
+	uint32_t head;
+	void *nm_buf;
+	int j;
+
+	if (iovcnt <= 0 || size <= 0) {
+		D("Wrong iov: iovcnt %d size %d", iovcnt, size);
+		return 0;
+	}
+
+	ring = priv->tx;
+	head = ring->head;
+	if (head == ring->tail) {
+		RD(1, "No space, drop %d bytes", size);
+		goto txsync;
+	}
+	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+	nm_buf_size = ring->nr_buf_size;
+	nm_buf_len = 0;
+
+	for (j = 0; j < iovcnt; j++) {
+		int iov_frag_size = iov[j].iov_len;
+		void *iov_frag_buf = iov[j].iov_base;
+
+		/*
+		 * Split each iovec fragment over more netmap slots, if
+		 * necessary.
+		 */
+		for (;;) {
+			int copylen;
+
+			copylen = iov_frag_size < nm_buf_size ?
+			    iov_frag_size : nm_buf_size;
+			pkt_copy(iov_frag_buf, nm_buf, copylen);
+
+			iov_frag_buf += copylen;
+			iov_frag_size -= copylen;
+			nm_buf += copylen;
+			nm_buf_size -= copylen;
+			nm_buf_len += copylen;
+
+			if (iov_frag_size == 0) {
+				break;
+			}
+
+			ring->slot[head].len = nm_buf_len;
+			ring->slot[head].flags = NS_MOREFRAG;
+			head = nm_ring_next(ring, head);
+			if (head == ring->tail) {
+				/*
+				 * We ran out of netmap slots while
+				 * splitting the iovec fragments.
+				 */
+				RD(1, "No space, drop %d bytes", size);
+				goto txsync;
+			}
+			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+			nm_buf_size = ring->nr_buf_size;
+			nm_buf_len = 0;
+		}
+	}
+
+	/* Complete the last slot, which must not have NS_MOREFRAG set. */
+	ring->slot[head].len = nm_buf_len;
+	ring->slot[head].flags = 0;
+	head = nm_ring_next(ring, head);
+
+	/* Now update ring->head and ring->cur. */
+	ring->head = ring->cur = head;
+
+	if (more) {	/* XXX could also check nm_ring_space(ring) > 64 */
+		return 0;
+	}
+txsync:
+	ioctl(be->fd, NIOCTXSYNC, NULL);
+
+	return 0;
+}
+
+static int
+netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	struct netmap_priv *priv = be->priv;
+	struct netmap_slot *slot = NULL;
+	struct netmap_ring *ring;
+	void *iov_frag_buf;
+	int iov_frag_size;
+	int totlen = 0;
+	uint32_t head;
+
+	assert(iovcnt);
+
+	ring = priv->rx;
+	head = ring->head;
+	iov_frag_buf = iov->iov_base;
+	iov_frag_size = iov->iov_len;
+
+	do {
+		int nm_buf_len;
+		void *nm_buf;
+
+		if (head == ring->tail) {
+			return 0;
+		}
+
+		slot = ring->slot + head;
+		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
+		nm_buf_len = slot->len;
+
+		for (;;) {
+			int copylen = nm_buf_len < iov_frag_size ?
+			    nm_buf_len : iov_frag_size;
+
+			pkt_copy(nm_buf, iov_frag_buf, copylen);
+			nm_buf += copylen;
+			nm_buf_len -= copylen;
+			iov_frag_buf += copylen;
+			iov_frag_size -= copylen;
+			totlen += copylen;
+
+			if (nm_buf_len == 0) {
+				break;
+			}
+
+			iov++;
+			iovcnt--;
+			if (iovcnt == 0) {
+				/* No space to receive. */
+				D("Short iov, drop %d bytes", totlen);
+				return -ENOSPC;
+			}
+			iov_frag_buf = iov->iov_base;
+			iov_frag_size = iov->iov_len;
+		}
+
+		head = nm_ring_next(ring, head);
+
+	} while (slot->flags & NS_MOREFRAG);
+
+	/* Release slots to netmap. */
+	ring->head = ring->cur = head;
+
+	return totlen;
+}
+
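A packet larger than one netmap buffer spans multiple consecutive slots, all but the last carrying NS_MOREFRAG; that is why netmap_send() sets the flag on every slot except the final one, and netmap_recv() loops until the flag clears. A minimal scan using the same idiom (illustrative sketch):

    /* Count the slots occupied by the next packet on 'ring' (sketch). */
    static int
    next_pkt_slots(struct netmap_ring *ring)
    {
            uint32_t head = ring->head;
            int n = 0;

            while (head != ring->tail) {
                    n++;
                    if (!(ring->slot[head].flags & NS_MOREFRAG))
                            return n;       /* last fragment */
                    head = nm_ring_next(ring, head);
            }
            return 0;       /* ring empty or packet truncated */
    }
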
+static struct net_backend netmap_backend = {
+	.name = "netmap|vale",
+	.init = netmap_init,
+	.cleanup = netmap_cleanup,
+	.send = netmap_send,
+	.recv = netmap_recv,
+	.get_cap = netmap_get_cap,
+	.set_cap = netmap_set_cap,
+};
+
+DATA_SET(net_backend_set, netmap_backend);
+
+/*
+ * Make sure a backend template is properly initialized, returning -1
+ * if any mandatory method is missing.
+ */
+static int
+netbe_fix(struct net_backend *be)
+{
+	if (be == NULL)
+		return -1;
+	if (be->name == NULL) {
+		fprintf(stderr, "missing name for %p\n", be);
+		return -1;
+	}
+	if (be->init == NULL) {
+		fprintf(stderr, "missing init for %p %s\n", be, be->name);
+		return -1;
+	}
+	if (be->cleanup == NULL) {
+		fprintf(stderr, "missing cleanup for %p %s\n", be, be->name);
+		return -1;
+	}
+	if (be->send == NULL) {
+		fprintf(stderr, "missing send for %p %s\n", be, be->name);
+		return -1;
+	}
+	if (be->recv == NULL) {
+		fprintf(stderr, "missing recv for %p %s\n", be, be->name);
+		return -1;
+	}
+	if (be->get_cap == NULL) {
+		fprintf(stderr, "missing get_cap for %p %s\n",
+		    be, be->name);
+		return -1;
+	}
+	if (be->set_cap == NULL) {
+		fprintf(stderr, "missing set_cap for %p %s\n",
+		    be, be->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * 'keys' is a set of prefixes separated by '|'; return a non-NULL
+ * pointer if the leftmost part of 'name' matches one of them,
+ * NULL otherwise.
+ */
+static const char *
+netbe_name_match(const char *keys, const char *name)
+{
+	const char *n = name, *good = keys;
+	char c;
+
+	if (!keys || !name)
+		return NULL;
+	while ( (c = *keys++) ) {
+		if (c == '|') {	/* reached the separator */
+			if (good)
+				break;
+			/* prepare for new round */
+			n = name;
+			good = keys;
+		} else if (good && c != *n++) {
+			good = NULL;	/* drop till next keyword */
+		}
+	}
+	return good;
+}
+
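Concretely, given the two templates above ("tap|vmnet" and "netmap|vale"), the matching behaves as follows (sketch):

    assert(netbe_name_match("tap|vmnet", "tap0") != NULL);
    assert(netbe_name_match("tap|vmnet", "vmnet3") != NULL);
    assert(netbe_name_match("netmap|vale", "vale0:1") != NULL);
    assert(netbe_name_match("netmap|vale", "netmap:em0") != NULL);
    assert(netbe_name_match("netmap|vale", "tap0") == NULL);
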
+/*
+ * Initialize a backend and attach to the frontend.
+ * This is called during frontend initialization.
+ *  @devname is the backend-name as supplied on the command line,
+ *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
+ *  @cb is the receive callback supplied by the frontend,
+ *	and it is invoked in the event loop when a receive
+ *	event is generated in the hypervisor,
+ *  @param is a pointer to the frontend, and normally used as
+ *	the argument for the callback.
+ */
+struct net_backend *
+netbe_init(const char *devname, net_backend_cb_t cb, void *param)
+{
+	struct net_backend **pbe, *be, *tbe = NULL;
+	int err;
+
+	/*
+	 * Find the network backend depending on the user-provided
+	 * device name. net_backend_set is built using a linker set.
+	 */
+	SET_FOREACH(pbe, net_backend_set) {
+		if (netbe_name_match((*pbe)->name, devname)) {
+			tbe = *pbe;
+			break;
+		}
+	}
+	if (tbe == NULL)
+		return NULL;
+	be = calloc(1, sizeof(*be));
+	if (be == NULL)
+		return NULL;
+	*be = *tbe;	/* copy the template */
+	if (netbe_fix(be)) {	/* make sure we have all fields */
+		free(be);
+		return NULL;
+	}
+	be->fd = -1;
+	be->priv = NULL;
+	be->sc = param;
+	be->be_vnet_hdr_len = 0;
+	be->fe_vnet_hdr_len = 0;
+
+	/* initialize the backend */
+	err = be->init(be, devname, cb, param);
+	if (err) {
+		free(be);
+		be = NULL;
+	}
+	return be;
+}
+
+void
+netbe_cleanup(struct net_backend *be)
+{
+	if (be != NULL) {
+		be->cleanup(be);
+		free(be);
+	}
+}
+
+uint64_t
+netbe_get_cap(struct net_backend *be)
+{
+	assert(be != NULL);
+	return be->get_cap(be);
+}
+
+int
+netbe_set_cap(struct net_backend *be, uint64_t features,
+	      unsigned vnet_hdr_len)
+{
+	int ret;
+
+	assert(be != NULL);
+
+	/*
+	 * There are only three valid lengths: 0, the full 12-byte header
+	 * (VNET_HDR_LEN) and the 10-byte header without the vrh_bufs field.
+	 */
+	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
+		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
+		return -1;
+
+	be->fe_vnet_hdr_len = vnet_hdr_len;
+
+	ret = be->set_cap(be, features, vnet_hdr_len);
+	assert(be->be_vnet_hdr_len == 0 ||
+	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
+
+	return ret;
+}
+
+static __inline struct iovec *
+iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+	struct iovec *riov;
+
+	/* XXX short-cut: assume first segment is >= tlen */
+	assert(iov[0].iov_len >= tlen);
+
+	iov[0].iov_len -= tlen;
+	if (iov[0].iov_len == 0) {
+		assert(*iovcnt > 1);
+		*iovcnt -= 1;
+		riov = &iov[1];
+	} else {
+		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+		riov = &iov[0];
+	}
+
+	return (riov);
+}
+
+void
+netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt, uint32_t len,
+	   int more)
+{
+	assert(be != NULL);
+#if 0
+	int i;
+
+	D("sending iovcnt %d len %d iovec %p", iovcnt, len, iov);
+	for (i = 0; i < iovcnt; i++)
+		D("  %3d: %4d %p", i, (int)iov[i].iov_len, iov[i].iov_base);
+#endif
+	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+		/* Here we are sure be->be_vnet_hdr_len is 0. */
+		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
+	}
+
+	be->send(be, iov, iovcnt, len, more);
+}
+
+/*
+ * Can return a negative value in case of errors.
+ */
+int
+netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	/* Length of the prepended virtio-net header. */
+	unsigned int hlen = be->fe_vnet_hdr_len;
+	int ret;
+
+	assert(be != NULL);
+
+	if (hlen && hlen != be->be_vnet_hdr_len) {
+		/* Here we are sure be->be_vnet_hdr_len is 0. */
+		struct virtio_net_rxhdr *vh;
+
+		/*
+		 * Get a pointer to the rx header, and use the
+		 * data immediately following it for the packet buffer.
+		 */
+		vh = iov[0].iov_base;
+		iov = iov_trim(iov, &iovcnt, hlen);
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers if merged rx bufs were negotiated.
+		 */
+		memset(vh, 0, hlen);
+		if (hlen == VNET_HDR_LEN) {
+			vh->vrh_bufs = 1;
+		}
+	}
+
+	ret = be->recv(be, iov, iovcnt);
+	if (ret > 0) {
+		ret += hlen;
+	}
+
+	return ret;
+}
+
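The header mediation above hinges on iov_trim(): when the frontend uses a vnet header but the backend does not (be_vnet_hdr_len == 0), the first hlen bytes of the guest buffer hold the header and the payload starts right after it. For instance (sketch, hypothetical sizes):

    /* Guest posts one 2048-byte buffer; fe_vnet_hdr_len is 12. */
    struct iovec iov[1] = { { .iov_base = buf, .iov_len = 2048 } };
    int iovcnt = 1;
    struct iovec *riov = iov_trim(iov, &iovcnt, 12);
    /*
     * riov->iov_base == buf + 12 and riov->iov_len == 2036; netbe_recv()
     * zeroes the 12 header bytes at buf and sets vrh_bufs to 1.
     */
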
+/*
+ * Read a packet from the backend and discard it.
+ * Returns the size of the discarded packet or zero if no packet was available.
+ * A negative error code is returned in case of read error.
+ */
+int
+netbe_rx_discard(struct net_backend *be)
+{
+	/*
+	 * MP note: the dummybuf is only used to discard frames,
+	 * so there is no need for it to be per-vtnet or locked.
+	 * We only make it large enough for a TSO-sized segment.
+	 */
+	static uint8_t dummybuf[65536 + 64];
+	struct iovec iov;
+
+	iov.iov_base = dummybuf;
+	iov.iov_len = sizeof(dummybuf);
+
+	return netbe_recv(be, &iov, 1);
+}
Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -32,22 +32,13 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
-#ifndef WITHOUT_CAPSICUM
-#include <sys/capsicum.h>
-#endif
 #include <sys/linker_set.h>
 #include <sys/select.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <net/ethernet.h>
-#ifndef NETMAP_WITH_LIBS
-#define NETMAP_WITH_LIBS
-#endif
-#include <net/netmap_user.h>
+#include <net/if.h>	/* IFNAMSIZ */
 
-#ifndef WITHOUT_CAPSICUM
-#include <capsicum_helpers.h>
-#endif
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -58,44 +49,20 @@
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
-#include <md5.h>
 #include <pthread.h>
 #include <pthread_np.h>
-#include <sysexits.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "mevent.h"
 #include "virtio.h"
 #include "net_utils.h"
+#include "net_backends.h"
 
 #define VTNET_RINGSZ	1024
 
 #define VTNET_MAXSEGS	256
 
-/*
- * Host capabilities. Note that we only offer a few of these.
- */
-#define	VIRTIO_NET_F_CSUM	(1 << 0)  /* host handles partial cksum */
-#define	VIRTIO_NET_F_GUEST_CSUM	(1 << 1)  /* guest handles partial cksum */
-#define	VIRTIO_NET_F_MAC	(1 << 5)  /* host supplies MAC */
-#define	VIRTIO_NET_F_GSO_DEPREC	(1 << 6)  /* deprecated: host handles GSO */
-#define	VIRTIO_NET_F_GUEST_TSO4	(1 << 7)  /* guest can rcv TSOv4 */
-#define	VIRTIO_NET_F_GUEST_TSO6	(1 << 8)  /* guest can rcv TSOv6 */
-#define	VIRTIO_NET_F_GUEST_ECN	(1 << 9)  /* guest can rcv TSO with ECN */
-#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
-#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
-#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
-#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
-#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
-#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
-#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
-#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
-#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
-#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
-#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
-			(1 << 21) /* guest can send gratuitous pkts */
-
 #define VTNET_S_HOSTCAPS      \
   ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
     VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
@@ -117,19 +84,6 @@
 
 #define VTNET_MAXQ	3
 
-/*
- * Fixed network header size
- */
-struct virtio_net_rxhdr {
-	uint8_t		vrh_flags;
-	uint8_t		vrh_gso_type;
-	uint16_t	vrh_hdr_len;
-	uint16_t	vrh_gso_size;
-	uint16_t	vrh_csum_start;
-	uint16_t	vrh_csum_offset;
-	uint16_t	vrh_bufs;
-} __packed;
-
 /*
  * Debug printf
  */
@@ -144,30 +98,24 @@
 	struct virtio_softc vsc_vs;
 	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
 	pthread_mutex_t vsc_mtx;
-	struct mevent	*vsc_mevp;
 
-	int		vsc_tapfd;
-	struct nm_desc	*vsc_nmd;
+	net_backend_t	*vsc_be;
 
 	int		vsc_rx_ready;
 	int		resetting;	/* protected by tx_mtx */
 
 	uint64_t	vsc_features;	/* negotiated features */
 
-	struct virtio_net_config vsc_config;
-
 	pthread_mutex_t	rx_mtx;
-	int		rx_vhdrlen;
+	unsigned int	rx_vhdrlen;
 	int		rx_merge;	/* merged rx bufs in use */
 
 	pthread_t 	tx_tid;
 	pthread_mutex_t	tx_mtx;
 	pthread_cond_t	tx_cond;
 	int		tx_in_progress;
 
-	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
-	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
-			     int iovcnt, int len);
+	struct virtio_net_config vsc_config;
 };
 
 static void pci_vtnet_reset(void *);
@@ -223,84 +171,20 @@
 	pthread_mutex_unlock(&sc->rx_mtx);
 }
 
-/*
- * Called to send a buffer chain out to the tap device
- */
-static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
-		 int len)
-{
-	static char pad[60]; /* all zero bytes */
-
-	if (sc->vsc_tapfd == -1)
-		return;
-
-	/*
-	 * If the length is < 60, pad out to that and add the
-	 * extra zero'd segment to the iov. It is guaranteed that
-	 * there is always an extra iov available by the caller.
-	 */
-	if (len < 60) {
-		iov[iovcnt].iov_base = pad;
-		iov[iovcnt].iov_len = 60 - len;
-		iovcnt++;
-	}
-	(void) writev(sc->vsc_tapfd, iov, iovcnt);
-}
-
-/*
- * Called when there is read activity on the tap file descriptor.
- * Each buffer posted by the guest is assumed to be able to contain
- * an entire ethernet frame + rx header.
- * MP note: the dummybuf is only used for discarding frames, so there
- * is no need for it to be per-vtnet or locked.
- */
-static uint8_t dummybuf[2048];
-
-static __inline struct iovec *
-rx_iov_trim(struct iovec *iov, int *niov, int tlen)
-{
-	struct iovec *riov;
-
-	/* XXX short-cut: assume first segment is >= tlen */
-	assert(iov[0].iov_len >= tlen);
-
-	iov[0].iov_len -= tlen;
-	if (iov[0].iov_len == 0) {
-		assert(*niov > 1);
-		*niov -= 1;
-		riov = &iov[1];
-	} else {
-		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
-		riov = &iov[0];
-	}
-
-	return (riov);
-}
-
 static void
-pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+pci_vtnet_rx(struct pci_vtnet_softc *sc)
 {
-	struct iovec iov[VTNET_MAXSEGS], *riov;
+	struct iovec iov[VTNET_MAXSEGS + 1];
 	struct vqueue_info *vq;
-	void *vrx;
 	int len, n;
 	uint16_t idx;
 
-	/*
-	 * Should never be called without a valid tap fd
-	 */
-	assert(sc->vsc_tapfd != -1);
-
-	/*
-	 * But, will be called when the rx ring hasn't yet
-	 * been set up.
-	 */
 	if (!sc->vsc_rx_ready) {
 		/*
+		 * The rx ring has not yet been set up.
 		 * Drop the packet and try later.
 		 */
-		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		netbe_rx_discard(sc->vsc_be);
 		return;
 	}
 
@@ -310,11 +194,11 @@
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	if (!vq_has_descs(vq)) {
 		/*
-		 * Drop the packet and try later.  Interrupt on
-		 * empty, if that's negotiated.
+		 * No available rx buffers. Drop the packet and try later.
+		 * Interrupt on empty, if that's negotiated.
 		 */
-		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
-		vq_endchains(vq, 1);
+		netbe_rx_discard(sc->vsc_be);
+		vq_endchains(vq, /*used_all_avail=*/1);
 		return;
 	}
 
@@ -325,256 +209,46 @@
 		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
 		assert(n >= 1 && n <= VTNET_MAXSEGS);
 
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vrx = iov[0].iov_base;
-		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
-		len = readv(sc->vsc_tapfd, riov, n);
+		len = netbe_recv(sc->vsc_be, iov, n);
 
-		if (len < 0 && errno == EWOULDBLOCK) {
+		if (len <= 0) {
 			/*
-			 * No more packets, but still some avail ring
-			 * entries.  Interrupt if needed/appropriate.
+			 * No more packets (len == 0), or backend errored
+			 * (len < 0). Return unused available buffers.
 			 */
 			vq_retchain(vq);
-			vq_endchains(vq, 0);
-			return;
-		}
-
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vrx, 0, sc->rx_vhdrlen);
-
-		if (sc->rx_merge) {
-			struct virtio_net_rxhdr *vrxh;
-
-			vrxh = vrx;
-			vrxh->vrh_bufs = 1;
+			if (len == 0) {
+				/* Interrupt if needed/appropriate and stop. */
+				vq_endchains(vq, /*used_all_avail=*/0);
+				return;
+			}
 		}
 
-		/*
-		 * Release this chain and handle more chains.
-		 */
-		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+		/* Publish the info to the guest */
+		vq_relchain(vq, idx, (uint32_t)len);
 	} while (vq_has_descs(vq));
 
 	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
-	vq_endchains(vq, 1);
-}
-
-static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
-	int r, i;
-	int len = 0;
-
-	for (r = nmd->cur_tx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
-
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_tx_ring)
-				r = nmd->first_tx_ring;
-			if (r == nmd->cur_tx_ring)
-				break;
-			continue;
-		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-
-		for (i = 0; i < iovcnt; i++) {
-			if (len + iov[i].iov_len > 2048)
-				break;
-			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
-			len += iov[i].iov_len;
-		}
-		ring->slot[cur].len = len;
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_tx_ring = r;
-		ioctl(nmd->fd, NIOCTXSYNC, NULL);
-		break;
-	}
-
-	return (len);
-}
-
-static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
-	int len = 0;
-	int i = 0;
-	int r;
-
-	for (r = nmd->cur_rx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
-		size_t left;
-
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_rx_ring)
-				r = nmd->first_rx_ring;
-			if (r == nmd->cur_rx_ring)
-				break;
-			continue;
-		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		left = ring->slot[cur].len;
-
-		for (i = 0; i < iovcnt && left > 0; i++) {
-			if (iov[i].iov_len > left)
-				iov[i].iov_len = left;
-			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
-			len += iov[i].iov_len;
-			left -= iov[i].iov_len;
-		}
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_rx_ring = r;
-		ioctl(nmd->fd, NIOCRXSYNC, NULL);
-		break;
-	}
-	for (; i < iovcnt; i++)
-		iov[i].iov_len = 0;
-
-	return (len);
-}
+	vq_endchains(vq, /*used_all_avail=*/1);
+}
 
 /*
- * Called to send a buffer chain out to the vale port
+ * Called when there is read activity on the backend file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
  */
-static void
-pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
-		    int len)
-{
-	static char pad[60]; /* all zero bytes */
-
-	if (sc->vsc_nmd == NULL)
-		return;
-
-	/*
-	 * If the length is < 60, pad out to that and add the
-	 * extra zero'd segment to the iov. It is guaranteed that
-	 * there is always an extra iov available by the caller.
-	 */
-	if (len < 60) {
-		iov[iovcnt].iov_base = pad;
-		iov[iovcnt].iov_len = 60 - len;
-		iovcnt++;
-	}
-	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
-}
-
-static void
-pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
-{
-	struct iovec iov[VTNET_MAXSEGS], *riov;
-	struct vqueue_info *vq;
-	void *vrx;
-	int len, n;
-	uint16_t idx;
-
-	/*
-	 * Should never be called without a valid netmap descriptor
-	 */
-	assert(sc->vsc_nmd != NULL);
-
-	/*
-	 * But, will be called when the rx ring hasn't yet
-	 * been set up.
-	 */
-	if (!sc->vsc_rx_ready) {
-		/*
-		 * Drop the packet and try later.
-		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
-		return;
-	}
-
-	/*
-	 * Check for available rx buffers
-	 */
-	vq = &sc->vsc_queues[VTNET_RXQ];
-	if (!vq_has_descs(vq)) {
-		/*
-		 * Drop the packet and try later.  Interrupt on
-		 * empty, if that's negotiated.
-		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
-		vq_endchains(vq, 1);
-		return;
-	}
-
-	do {
-		/*
-		 * Get descriptor chain.
-		 */
-		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
-		assert(n >= 1 && n <= VTNET_MAXSEGS);
-
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vrx = iov[0].iov_base;
-		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
-		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
-
-		if (len == 0) {
-			/*
-			 * No more packets, but still some avail ring
-			 * entries.  Interrupt if needed/appropriate.
-			 */
-			vq_retchain(vq);
-			vq_endchains(vq, 0);
-			return;
-		}
-
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vrx, 0, sc->rx_vhdrlen);
-
-		if (sc->rx_merge) {
-			struct virtio_net_rxhdr *vrxh;
-
-			vrxh = vrx;
-			vrxh->vrh_bufs = 1;
-		}
-
-		/*
-		 * Release this chain and handle more chains.
-		 */
-		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
-	} while (vq_has_descs(vq));
-
-	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
-	vq_endchains(vq, 1);
-}
-
 static void
 pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
 {
 	struct pci_vtnet_softc *sc = param;
 
 	pthread_mutex_lock(&sc->rx_mtx);
-	sc->pci_vtnet_rx(sc);
+	pci_vtnet_rx(sc);
 	pthread_mutex_unlock(&sc->rx_mtx);
 }
 
+/* Called on RX kick. */
 static void
 pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
 {
@@ -589,35 +263,35 @@
 	}
 }
 
+/* TX virtqueue processing, called by the TX thread. */
 static void
 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
 {
 	struct iovec iov[VTNET_MAXSEGS + 1];
 	int i, n;
-	int plen, tlen;
+	uint32_t len;
 	uint16_t idx;
 
 	/*
-	 * Obtain chain of descriptors.  The first one is
-	 * really the header descriptor, so we need to sum
-	 * up two lengths: packet length and transfer length.
+	 * Obtain chain of descriptors. The first descriptor also
+	 * contains the virtio-net header.
	 */
 	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
 	assert(n >= 1 && n <= VTNET_MAXSEGS);
 
-	plen = 0;
-	tlen = iov[0].iov_len;
-	for (i = 1; i < n; i++) {
-		plen += iov[i].iov_len;
-		tlen += iov[i].iov_len;
+	/* TODO let netbe_send return the total length, so that
+	 * we do not need to compute it here. */
+	len = 0;
+	for (i = 0; i < n; i++) {
+		len += iov[i].iov_len;
 	}
 
-	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
-	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
+	netbe_send(sc->vsc_be, iov, n, len, 0 /* more */);
 
-	/* chain is processed, release it and set tlen */
-	vq_relchain(vq, idx, tlen);
+	/* chain is processed, release it and set len */
+	vq_relchain(vq, idx, len);
 }
 
+/* Called on TX kick. */
 static void
 pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
 {
@@ -647,6 +321,14 @@
 	struct vqueue_info *vq;
 	int error;
 
+	{
+		struct pci_devinst *pi = sc->vsc_vs.vs_pi;
+		char tname[MAXCOMLEN + 1];
+
+		snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
+		    pi->pi_func);
+		pthread_set_name_np(pthread_self(), tname);
+	}
+
 	vq = &sc->vsc_queues[VTNET_TXQ];
 
 	/*
@@ -684,7 +366,7 @@
 		/*
 		 * Generate an interrupt if needed.
 		 */
-		vq_endchains(vq, 1);
+		vq_endchains(vq, /*used_all_avail=*/1);
 
 		pthread_mutex_lock(&sc->tx_mtx);
 	}
@@ -699,93 +381,28 @@
 }
 #endif
 
-static void
-pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
-{
-	char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
-	cap_rights_t rights;
-#endif
-
-	strcpy(tbuf, "/dev/");
-	strlcat(tbuf, devname, sizeof(tbuf));
-
-	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
-	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
-
-	sc->vsc_tapfd = open(tbuf, O_RDWR);
-	if (sc->vsc_tapfd == -1) {
-		WPRINTF(("open of tap device %s failed\n", tbuf));
-		return;
-	}
-
-	/*
-	 * Set non-blocking and register for read
-	 * notifications with the event loop
-	 */
-	int opt = 1;
-	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
-		WPRINTF(("tap device O_NONBLOCK failed\n"));
-		close(sc->vsc_tapfd);
-		sc->vsc_tapfd = -1;
-	}
-
-#ifndef WITHOUT_CAPSICUM
-	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
-	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
-		errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
-	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
-				  EVF_READ,
-				  pci_vtnet_rx_callback,
-				  sc);
-	if (sc->vsc_mevp == NULL) {
-		WPRINTF(("Could not register event\n"));
-		close(sc->vsc_tapfd);
-		sc->vsc_tapfd = -1;
-	}
-}
-
-static void
-pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
-{
-	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
-	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
-
-	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
-	if (sc->vsc_nmd == NULL) {
-		WPRINTF(("open of netmap device %s failed\n", ifname));
-		return;
-	}
-
-	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
-				  EVF_READ,
-				  pci_vtnet_rx_callback,
-				  sc);
-	if (sc->vsc_mevp == NULL) {
-		WPRINTF(("Could not register event\n"));
-		nm_close(sc->vsc_nmd);
-		sc->vsc_nmd = NULL;
-	}
-}
-
 static int
 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
-	char tname[MAXCOMLEN + 1];
 	struct pci_vtnet_softc *sc;
-	char *devname;
-	char *vtopts;
+	char tname[MAXCOMLEN + 1];
+	struct virtio_consts *vc;
 	int mac_provided;
 
-	sc = calloc(1, sizeof(struct pci_vtnet_softc));
+	/*
	 * Allocate data structures for further virtio initializations.
	 * sc also contains a copy of the vtnet_vi_consts, because the
	 * capabilities change depending on the backend.
	 */
+	/* TODO: add struct virtio_consts field to pci_vtnet_softc. */
+	sc = calloc(1, sizeof(struct pci_vtnet_softc) +
+	    sizeof(struct virtio_consts));
+	vc = (struct virtio_consts *)(sc + 1);
+	memcpy(vc, &vtnet_vi_consts, sizeof(*vc));
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
-	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
-	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
-
 	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
 	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
 	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
@@ -796,13 +413,13 @@
 #endif
 
 	/*
-	 * Attempt to open the tap device and read the MAC address
+	 * Attempt to open the backend device and read the MAC address
 	 * if specified
 	 */
 	mac_provided = 0;
-	sc->vsc_tapfd = -1;
-	sc->vsc_nmd = NULL;
 	if (opts != NULL) {
+		char *devname;
+		char *vtopts;
 		int err;
 
 		devname = vtopts = strdup(opts);
@@ -817,13 +434,12 @@
 			mac_provided = 1;
 		}
 
-		if (strncmp(devname, "vale", 4) == 0)
-			pci_vtnet_netmap_setup(sc, devname);
-		if (strncmp(devname, "tap", 3) == 0 ||
-		    strncmp(devname, "vmnet", 5) == 0)
-			pci_vtnet_tap_setup(sc, devname);
-
+		sc->vsc_be = netbe_init(devname, pci_vtnet_rx_callback, sc);
 		free(devname);
+		if (sc->vsc_be == NULL) {
+			return (EINVAL);
+		}
+		vc->vc_hv_caps |= netbe_get_cap(sc->vsc_be);
 	}
 
 	if (!mac_provided) {
@@ -837,10 +453,12 @@
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
-	/* Link is up if we managed to open tap device or vale port. */
-	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
-	    sc->vsc_nmd != NULL);
+	/* Link is up if we managed to open the backend device. */
+	sc->vsc_config.status = (opts == NULL || sc->vsc_be);
 
+	vi_softc_linkup(&sc->vsc_vs, vc, sc, pi, sc->vsc_queues);
+	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
 	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
 	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
 		return (1);
@@ -876,8 +494,8 @@
 	struct pci_vtnet_softc *sc = vsc;
 	void *ptr;
 
-	if (offset < 6) {
-		assert(offset + size <= 6);
+	if (offset < (int)sizeof(sc->vsc_config.mac)) {
+		assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
 		/*
 		 * The driver is allowed to change the MAC address
 		 */
@@ -909,14 +527,18 @@
 
 	sc->vsc_features = negotiated_features;
 
-	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+	if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) {
 		sc->rx_merge = 0;
-		/* non-merge rx header is 2 bytes shorter */
+		/* Without mergeable rx buffers, the virtio-net header is 2
+		 * bytes shorter than sizeof(struct virtio_net_rxhdr). */
 		sc->rx_vhdrlen -= 2;
 	}
+
+	/* Tell the backend to enable some capabilities it has advertised. */
+	netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen);
 }
 
-struct pci_devemu pci_de_vnet = {
+static struct pci_devemu pci_de_vnet = {
 	.pe_emu =	"virtio-net",
 	.pe_init =	pci_vtnet_init,
 	.pe_barwrite =	vi_pci_write,