Changeset View
Standalone View
usr.sbin/bhyve/net_backends.c
- This file was added.
/*- | |||||
* Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> | |||||
* | |||||
* Redistribution and use in source and binary forms, with or without | |||||
* modification, are permitted provided that the following conditions | |||||
* are met: | |||||
* 1. Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* 2. Redistributions in binary form must reproduce the above copyright | |||||
* notice, this list of conditions and the following disclaimer in the | |||||
* documentation and/or other materials provided with the distribution. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS | |||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, | |||||
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT | |||||
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR | |||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE | |||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, | |||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*/ | |||||
/* | |||||
* This file implements multiple network backends (tap, netmap, ...), | |||||
* to be used by network frontends such as virtio-net and e1000. | |||||
* The API to access the backend (e.g. send/receive packets, negotiate | |||||
* features) is exported by net_backends.h. | |||||
*/ | |||||
#include <sys/cdefs.h> | |||||
#include <sys/uio.h> | |||||
#include <sys/ioctl.h> | |||||
#include <sys/mman.h> | |||||
#include <sys/types.h> /* u_short etc */ | |||||
#include <net/if.h> | |||||
jhb: Sorry I missed this earlier, sys/types.h should always be first when it is used. | |||||
#include <err.h> | |||||
Done Inline Actions — jhb: Ideally the other net/*.h headers would be here with net/if.h. pci_virtio_net.c also had the capsicum headers sorted into the regular blocks. | |||||
Done Inline ActionsDone. I hope I did not miss anything. vmaffione: Done. I hope I did not miss anything. | |||||
#include <errno.h> | |||||
#include <fcntl.h> | |||||
#include <stdio.h> | |||||
#include <stdlib.h> | |||||
#include <stdint.h> | |||||
#include <string.h> | |||||
#include <unistd.h> | |||||
#include <sysexits.h> | |||||
#include <assert.h> | |||||
#include <pthread.h> | |||||
#include <pthread_np.h> | |||||
#include <poll.h> | |||||
#include <assert.h> | |||||
#include <net/netmap.h> | |||||
#include <net/netmap_virt.h> | |||||
#define NETMAP_WITH_LIBS | |||||
#include <net/netmap_user.h> | |||||
#ifndef WITHOUT_CAPSICUM | |||||
#include <capsicum_helpers.h> | |||||
#include <sys/capsicum.h> | |||||
#endif | |||||
#include "iov.h" | |||||
#include "mevent.h" | |||||
#include "net_backends.h" | |||||
#include <sys/linker_set.h> | |||||
/* | |||||
* Each network backend registers a set of function pointers that are | |||||
* used to implement the net backends API. | |||||
* This might need to be exposed if we implement backends in separate files. | |||||
*/ | |||||
struct net_backend { | |||||
const char *prefixes; /* prefixes matching this backend */ | |||||
/* | |||||
* Routines used to initialize and cleanup the resources needed | |||||
* by a backend. The cleanup function is used internally, | |||||
* and should not be called by the frontend. | |||||
*/ | |||||
int (*init)(struct net_backend *be, const char *devname, | |||||
net_be_rxeof_t cb, void *param); | |||||
void (*cleanup)(struct net_backend *be); | |||||
/* | |||||
* Called to serve a guest transmit request. The scatter-gather | |||||
* vector provided by the caller has 'iovcnt' elements and contains | |||||
* the packet to send. | |||||
*/ | |||||
ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt); | |||||
/* | |||||
* Called to receive a packet from the backend. When the function | |||||
* returns a positive value 'len', the scatter-gather vector | |||||
* provided by the caller contains a packet with such length. | |||||
* The function returns 0 if the backend doesn't have a new packet to | |||||
* receive. | |||||
*/ | |||||
ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt); | |||||
/* | |||||
* Ask the backend for the virtio-net features it is able to | |||||
* support. Possible features are TSO, UFO and checksum offloading | |||||
* in both rx and tx direction and for both IPv4 and IPv6. | |||||
*/ | |||||
uint64_t (*get_cap)(struct net_backend *be); | |||||
/* | |||||
* Tell the backend to enable/disable the specified virtio-net | |||||
* features (capabilities). | |||||
*/ | |||||
int (*set_cap)(struct net_backend *be, uint64_t features, | |||||
unsigned int vnet_hdr_len); | |||||
struct pci_vtnet_softc *sc; | |||||
int fd; | |||||
/* Length of the virtio-net header used by the backend and the | |||||
* frontend, respectively. A zero value means that the header | |||||
* is not used. */ | |||||
jhbUnsubmitted Done Inline ActionsMost of the comments are correct, but the style would be to put the leading '/*' and trailing '*/' on their own lines. A few other comments in the patch also are not quite correct. jhb: Most of the comments are correct, but the style would be to put the leading '/*' and trailing… | |||||
vmaffioneAuthorUnsubmitted Done Inline ActionsYes, indeed. Thanks for spotting them. vmaffione: Yes, indeed. Thanks for spotting them. | |||||
unsigned int be_vnet_hdr_len; | |||||
unsigned int fe_vnet_hdr_len; | |||||
/* Size of backend-specific private data. */ | |||||
size_t priv_size; | |||||
/* Room for backend-specific data. */ | |||||
char opaque[0]; | |||||
}; | |||||
SET_DECLARE(net_backend_set, struct net_backend);

/*
 * Size of struct virtio_net_rxhdr. The expansion is parenthesized so the
 * macro can be used safely inside larger expressions.
 */
#define	VNET_HDR_LEN	(sizeof(struct virtio_net_rxhdr))

/* Print a diagnostic; 'params' is a parenthesized printf() argument list. */
#define	WPRINTF(params)	printf params
/* the tap backend */ | |||||
/*
 * Private state of the tap backend. It lives in the 'opaque' area of
 * struct net_backend (tap_init() and tap_cleanup() cast be->opaque to it).
 */
struct tap_priv { | |||||
/* Event registration returned by mevent_add() in tap_init(). */
struct mevent *mevp; | |||||
}; | |||||
static void | |||||
tap_cleanup(struct net_backend *be) | |||||
{ | |||||
struct tap_priv *priv = (struct tap_priv *)be->opaque; | |||||
if (priv->mevp) { | |||||
mevent_delete(priv->mevp); | |||||
} | |||||
if (be->fd != -1) { | |||||
close(be->fd); | |||||
be->fd = -1; | |||||
} | |||||
} | |||||
static int | |||||
tap_init(struct net_backend *be, const char *devname, | |||||
net_be_rxeof_t cb, void *param) | |||||
{ | |||||
struct tap_priv *priv = (struct tap_priv *)be->opaque; | |||||
char tbuf[80]; | |||||
int fd; | |||||
int opt = 1; | |||||
#ifndef WITHOUT_CAPSICUM | |||||
cap_rights_t rights; | |||||
#endif | |||||
if (cb == NULL) { | |||||
WPRINTF(("TAP backend requires non-NULL callback\n")); | |||||
return -1; | |||||
} | |||||
strcpy(tbuf, "/dev/"); | |||||
strlcat(tbuf, devname, sizeof(tbuf)); | |||||
fd = open(tbuf, O_RDWR); | |||||
if (fd == -1) { | |||||
WPRINTF(("open of tap device %s failed\n", tbuf)); | |||||
goto error; | |||||
} | |||||
/* | |||||
* Set non-blocking and register for read | |||||
* notifications with the event loop | |||||
*/ | |||||
if (ioctl(fd, FIONBIO, &opt) < 0) { | |||||
WPRINTF(("tap device O_NONBLOCK failed\n")); | |||||
goto error; | |||||
} | |||||
#ifndef WITHOUT_CAPSICUM | |||||
cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); | |||||
if (caph_rights_limit(fd, &rights) == -1) | |||||
errx(EX_OSERR, "Unable to apply rights for sandbox"); | |||||
#endif | |||||
priv->mevp = mevent_add(fd, EVF_READ, cb, param); | |||||
if (priv->mevp == NULL) { | |||||
WPRINTF(("Could not register event\n")); | |||||
goto error; | |||||
} | |||||
be->fd = fd; | |||||
return 0; | |||||
jhbUnsubmitted Done Inline ActionsFreeBSD's style is to put ()'s around return values, so: return (0); here and elsewhere throughout the patch. jhb: FreeBSD's style is to put ()'s around return values, so:
```
return (0);
```
here and… | |||||
error: | |||||
tap_cleanup(be); | |||||
return -1; | |||||
} | |||||
/* | |||||
* Called to send a buffer chain out to the tap device | |||||
*/ | |||||
static ssize_t | |||||
tap_send(struct net_backend *be, struct iovec *iov, int iovcnt) | |||||
{ | |||||
return writev(be->fd, iov, iovcnt); | |||||
} | |||||
static ssize_t | |||||
tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) | |||||
{ | |||||
Done Inline Actionsreturn() here as well. jhb: return() here as well. | |||||
Done Inline ActionsOk, I thought the rule was not valid for return statements containing a function call. vmaffione: Ok, I thought the rule was not valid for return statements containing a function call. | |||||
ssize_t ret; | |||||
/* Should never be called without a valid tap fd */ | |||||
assert(be->fd != -1); | |||||
ret = readv(be->fd, iov, iovcnt); | |||||
if (ret < 0 && errno == EWOULDBLOCK) { | |||||
return 0; | |||||
} | |||||
return ret; | |||||
} | |||||
/*
 * Report the virtio-net features supported by the tap backend.
 * There are none for now, so the result is always 0.
 */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	const uint64_t no_features = 0;

	return (no_features);
}
/*
 * Accept only the "nothing enabled" configuration: returns 0 when both
 * 'features' and 'vnet_hdr_len' are zero, and -1 otherwise.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	if (features == 0 && vnet_hdr_len == 0)
		return (0);

	return (-1);
}
static struct net_backend tap_backend = { | |||||
.prefixes = "tap|vmnet", | |||||
jhbUnsubmitted Done Inline ActionsI think it would be simpler to just have two backend structs for tap vs vmnet and then you don't need all the parsing logic for handling '|', etc. Just have a single 'prefix' and use simple 'strncmp' matching without needing all the token, etc. Having an extra struct is pretty cheap in exchange for the code simplicity. jhb: I think it would be simpler to just have two backend structs for tap vs vmnet and then you… | |||||
vmaffioneAuthorUnsubmitted Done Inline ActionsI knew this was going to come up... Yes, it makes sense. vmaffione: I knew this was going to come up... Yes, it makes sense. | |||||
.priv_size = sizeof(struct tap_priv), | |||||
.init = tap_init, | |||||
.cleanup = tap_cleanup, | |||||
.send = tap_send, | |||||
.recv = tap_recv, | |||||
.get_cap = tap_get_cap, | |||||
.set_cap = tap_set_cap, | |||||
}; | |||||
DATA_SET(net_backend_set, tap_backend); | |||||
/* | |||||
* The netmap backend | |||||
*/ | |||||
Done Inline Actionss/ot/of/ jhb: s/ot/of/ | |||||
/*
 * The virtio-net features supported by netmap: checksum offload plus
 * TSO4/TSO6/UFO, each in both the host and the guest variant.
 */
#define	NETMAP_FEATURES							\
	(VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM |			\
	 VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_GUEST_TSO4 |		\
	 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_GUEST_TSO6 |		\
	 VIRTIO_NET_F_HOST_UFO | VIRTIO_NET_F_GUEST_UFO)
/*
 * Private state of the netmap backend. It lives in the 'opaque' area of
 * struct net_backend and is filled in by netmap_init().
 */
struct netmap_priv { | |||||
/* Copy of the device name passed to netmap_init(). */
char ifname[IFNAMSIZ]; | |||||
/* Descriptor returned by nm_open(); closed by netmap_cleanup(). */
struct nm_desc *nmd; | |||||
/* Taken from nmd->req.nr_arg2 at init time. */
uint16_t memid; | |||||
/* Receive and transmit rings with index 0 of the opened port. */
struct netmap_ring *rx; | |||||
struct netmap_ring *tx; | |||||
/* Read-event registration returned by mevent_add(). */
struct mevent *mevp; | |||||
/* Receive callback and its argument, as supplied to netmap_init(). */
net_be_rxeof_t cb; | |||||
void *cb_param; | |||||
}; | |||||
static void | |||||
nmreq_init(struct nmreq *req, char *ifname) | |||||
{ | |||||
memset(req, 0, sizeof(*req)); | |||||
strncpy(req->nr_name, ifname, sizeof(req->nr_name)); | |||||
req->nr_version = NETMAP_API; | |||||
} | |||||
static int | |||||
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) | |||||
{ | |||||
int err; | |||||
struct nmreq req; | |||||
struct netmap_priv *priv = (struct netmap_priv *)be->opaque; | |||||
nmreq_init(&req, priv->ifname); | |||||
req.nr_cmd = NETMAP_BDG_VNET_HDR; | |||||
req.nr_arg1 = vnet_hdr_len; | |||||
err = ioctl(be->fd, NIOCREGIF, &req); | |||||
if (err) { | |||||
WPRINTF(("Unable to set vnet header length %d\n", | |||||
vnet_hdr_len)); | |||||
return err; | |||||
} | |||||
be->be_vnet_hdr_len = vnet_hdr_len; | |||||
return 0; | |||||
} | |||||
static int | |||||
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) | |||||
{ | |||||
int prev_hdr_len = be->be_vnet_hdr_len; | |||||
int ret; | |||||
if (vnet_hdr_len == prev_hdr_len) { | |||||
return 1; | |||||
} | |||||
ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); | |||||
if (ret) { | |||||
return 0; | |||||
} | |||||
netmap_set_vnet_hdr_len(be, prev_hdr_len); | |||||
return 1; | |||||
} | |||||
static uint64_t | |||||
netmap_get_cap(struct net_backend *be) | |||||
{ | |||||
return netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? | |||||
NETMAP_FEATURES : 0; | |||||
jhbUnsubmitted Done Inline Actionsstyle(9) also does a 4-space continuation indent. jhb: style(9) also does a 4-space continuation indent. | |||||
} | |||||
/*
 * Apply the negotiated configuration to the netmap backend. Only the
 * header length is programmed; 'features' is not used here.
 */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int rc;

	rc = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	return (rc);
}
static int | |||||
netmap_init(struct net_backend *be, const char *devname, | |||||
net_be_rxeof_t cb, void *param) | |||||
{ | |||||
struct netmap_priv *priv = (struct netmap_priv *)be->opaque; | |||||
strncpy(priv->ifname, devname, sizeof(priv->ifname)); | |||||
jhbUnsubmitted Done Inline ActionsMaybe use strlcpy() instead? jhb: Maybe use strlcpy() instead? | |||||
vmaffioneAuthorUnsubmitted Done Inline ActionsThanks, this slipped through during my rebase. vmaffione: Thanks, this slipped through during my rebase. | |||||
Done Inline Actionsreturn () jhb: return () | |||||
priv->ifname[sizeof(priv->ifname) - 1] = '\0'; | |||||
priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); | |||||
if (priv->nmd == NULL) { | |||||
WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n", | |||||
devname, strerror(errno))); | |||||
free(priv); | |||||
return -1; | |||||
} | |||||
Done Inline Actionsreturn () jhb: return () | |||||
priv->memid = priv->nmd->req.nr_arg2; | |||||
priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); | |||||
priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); | |||||
priv->cb = cb; | |||||
priv->cb_param = param; | |||||
be->fd = priv->nmd->fd; | |||||
priv->mevp = mevent_add(be->fd, EVF_READ, cb, param); | |||||
if (priv->mevp == NULL) { | |||||
WPRINTF(("Could not register event\n")); | |||||
return -1; | |||||
} | |||||
return 0; | |||||
} | |||||
static void | |||||
netmap_cleanup(struct net_backend *be) | |||||
{ | |||||
struct netmap_priv *priv = (struct netmap_priv *)be->opaque; | |||||
if (priv->mevp) { | |||||
mevent_delete(priv->mevp); | |||||
} | |||||
if (priv->nmd) { | |||||
nm_close(priv->nmd); | |||||
} | |||||
be->fd = -1; | |||||
} | |||||
/*
 * Copy exactly 'l' bytes from '_src' to '_dst'. The two regions must not
 * overlap. A non-positive length copies nothing.
 *
 * This relies on memcpy(), so no more than 'l' bytes are ever read or
 * written, whatever the length or the alignment of the buffers.
 */
static inline void
pkt_copy(const void *_src, void *_dst, int l)
{
	if (l <= 0)
		return;

	memcpy(_dst, _src, (size_t)l);
}
static ssize_t | |||||
netmap_send(struct net_backend *be, struct iovec *iov, | |||||
int iovcnt) | |||||
{ | |||||
struct netmap_priv *priv = (struct netmap_priv *)be->opaque; | |||||
struct netmap_ring *ring; | |||||
ssize_t totlen = 0; | |||||
int nm_buf_size; | |||||
int nm_buf_len; | |||||
uint32_t head; | |||||
void *nm_buf; | |||||
int j; | |||||
ring = priv->tx; | |||||
head = ring->head; | |||||
if (head == ring->tail) { | |||||
WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt))); | |||||
goto txsync; | |||||
} | |||||
nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); | |||||
nm_buf_size = ring->nr_buf_size; | |||||
nm_buf_len = 0; | |||||
for (j = 0; j < iovcnt; j++) { | |||||
int iov_frag_size = iov[j].iov_len; | |||||
void *iov_frag_buf = iov[j].iov_base; | |||||
totlen += iov_frag_size; | |||||
/* Split each iovec fragment over more netmap slots, if | |||||
necessary. */ | |||||
for (;;) { | |||||
int copylen; | |||||
copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; | |||||
pkt_copy(iov_frag_buf, nm_buf, copylen); | |||||
iov_frag_buf += copylen; | |||||
iov_frag_size -= copylen; | |||||
nm_buf += copylen; | |||||
nm_buf_size -= copylen; | |||||
nm_buf_len += copylen; | |||||
if (iov_frag_size == 0) { | |||||
break; | |||||
} | |||||
ring->slot[head].len = nm_buf_len; | |||||
ring->slot[head].flags = NS_MOREFRAG; | |||||
head = nm_ring_next(ring, head); | |||||
if (head == ring->tail) { | |||||
/* We ran out of netmap slots while | |||||
* splitting the iovec fragments. */ | |||||
WPRINTF(("No space, drop %zu bytes\n", | |||||
count_iov(iov, iovcnt))); | |||||
goto txsync; | |||||
} | |||||
nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); | |||||
nm_buf_size = ring->nr_buf_size; | |||||
nm_buf_len = 0; | |||||
} | |||||
} | |||||
/* Complete the last slot, which must not have NS_MOREFRAG set. */ | |||||
ring->slot[head].len = nm_buf_len; | |||||
ring->slot[head].flags = 0; | |||||
head = nm_ring_next(ring, head); | |||||
/* Now update ring->head and ring->cur. */ | |||||
ring->head = ring->cur = head; | |||||
txsync: | |||||
ioctl(be->fd, NIOCTXSYNC, NULL); | |||||
return totlen; | |||||
} | |||||
static ssize_t | |||||
netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) | |||||
{ | |||||
struct netmap_priv *priv = (struct netmap_priv *)be->opaque; | |||||
struct netmap_slot *slot = NULL; | |||||
struct netmap_ring *ring; | |||||
void *iov_frag_buf; | |||||
int iov_frag_size; | |||||
ssize_t totlen = 0; | |||||
uint32_t head; | |||||
assert(iovcnt); | |||||
ring = priv->rx; | |||||
head = ring->head; | |||||
iov_frag_buf = iov->iov_base; | |||||
iov_frag_size = iov->iov_len; | |||||
do { | |||||
int nm_buf_len; | |||||
void *nm_buf; | |||||
if (head == ring->tail) { | |||||
return 0; | |||||
} | |||||
slot = ring->slot + head; | |||||
nm_buf = NETMAP_BUF(ring, slot->buf_idx); | |||||
nm_buf_len = slot->len; | |||||
for (;;) { | |||||
int copylen = nm_buf_len < iov_frag_size ? nm_buf_len : iov_frag_size; | |||||
Done Inline ActionsI suspect this is longer than 80 cols? jhb: I suspect this is longer than 80 cols? | |||||
pkt_copy(nm_buf, iov_frag_buf, copylen); | |||||
nm_buf += copylen; | |||||
nm_buf_len -= copylen; | |||||
iov_frag_buf += copylen; | |||||
iov_frag_size -= copylen; | |||||
totlen += copylen; | |||||
if (nm_buf_len == 0) { | |||||
break; | |||||
} | |||||
iov++; | |||||
iovcnt--; | |||||
if (iovcnt == 0) { | |||||
/* No space to receive. */ | |||||
WPRINTF(("Short iov, drop %zd bytes\n", | |||||
totlen)); | |||||
return -ENOSPC; | |||||
} | |||||
iov_frag_buf = iov->iov_base; | |||||
iov_frag_size = iov->iov_len; | |||||
} | |||||
head = nm_ring_next(ring, head); | |||||
} while (slot->flags & NS_MOREFRAG); | |||||
/* Release slots to netmap. */ | |||||
ring->head = ring->cur = head; | |||||
return totlen; | |||||
} | |||||
static struct net_backend netmap_backend = { | |||||
.prefixes = "netmap|vale", | |||||
.priv_size = sizeof(struct netmap_priv), | |||||
.init = netmap_init, | |||||
.cleanup = netmap_cleanup, | |||||
.send = netmap_send, | |||||
.recv = netmap_recv, | |||||
.get_cap = netmap_get_cap, | |||||
.set_cap = netmap_set_cap, | |||||
}; | |||||
DATA_SET(net_backend_set, netmap_backend); | |||||
Done Inline Actionss/ot/of/ jhb: s/ot/of/ | |||||
/* | |||||
* Initialize a backend and attach to the frontend. | |||||
* This is called during frontend initialization. | |||||
* @pbe is a pointer to the backend to be initialized | |||||
* @devname is the backend-name as supplied on the command line, | |||||
* e.g. -s 2:0,frontend-name,backend-name[,other-args] | |||||
* @cb is the receive callback supplied by the frontend, | |||||
* and it is invoked in the event loop when a receive | |||||
* event is generated in the hypervisor, | |||||
* @param is a pointer to the frontend, and normally used as | |||||
* the argument for the callback. | |||||
*/ | |||||
int | |||||
netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, | |||||
void *param) | |||||
{ | |||||
struct net_backend **pbe, *nbe, *tbe = NULL; | |||||
int err; | |||||
/* | |||||
* Find the network backend that matches the user-provided | |||||
* device name. net_backend_set is built using a linker set. | |||||
*/ | |||||
SET_FOREACH(pbe, net_backend_set) { | |||||
char *token, *last; | |||||
char tokens[128]; | |||||
int match = 0; | |||||
strlcpy(tokens, (*pbe)->prefixes, sizeof(tokens)); | |||||
for (token = strtok_r(tokens, "|", &last); | |||||
token != NULL; token = strtok_r(NULL, "|", &last)) { | |||||
if (!strncmp(devname, token, strlen(token))) { | |||||
match = 1; | |||||
break; | |||||
} | |||||
} | |||||
if (match) { | |||||
Done Inline ActionsI prefer to use 'strncmp(...) == 0' rather than !strncmp, and style(9) requests to only use ! for booleans. jhb: I prefer to use 'strncmp(...) == 0' rather than !strncmp, and style(9) requests to only use ! | |||||
tbe = *pbe; | |||||
assert(tbe->init != NULL); | |||||
assert(tbe->cleanup != NULL); | |||||
assert(tbe->send != NULL); | |||||
assert(tbe->recv != NULL); | |||||
assert(tbe->get_cap != NULL); | |||||
assert(tbe->set_cap != NULL); | |||||
break; | |||||
} | |||||
} | |||||
*ret = NULL; | |||||
if (tbe == NULL) | |||||
return (EINVAL); | |||||
nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); | |||||
*nbe = *tbe; /* copy the template */ | |||||
nbe->fd = -1; | |||||
nbe->sc = param; | |||||
nbe->be_vnet_hdr_len = 0; | |||||
nbe->fe_vnet_hdr_len = 0; | |||||
/* initialize the backend */ | |||||
err = nbe->init(nbe, devname, cb, param); | |||||
if (err) { | |||||
free(nbe); | |||||
return err; | |||||
} | |||||
*ret = nbe; | |||||
return 0; | |||||
} | |||||
void | |||||
netbe_cleanup(struct net_backend *be) | |||||
{ | |||||
if (be != NULL) { | |||||
be->cleanup(be); | |||||
free(be); | |||||
} | |||||
} | |||||
uint64_t | |||||
netbe_get_cap(struct net_backend *be) | |||||
{ | |||||
assert(be != NULL); | |||||
return be->get_cap(be); | |||||
Done Inline Actionsreturn () jhb: return () | |||||
} | |||||
int | |||||
netbe_set_cap(struct net_backend *be, uint64_t features, | |||||
unsigned vnet_hdr_len) | |||||
{ | |||||
int ret; | |||||
assert(be != NULL); | |||||
/* There are only three valid lengths. */ | |||||
if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN | |||||
&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) | |||||
return -1; | |||||
be->fe_vnet_hdr_len = vnet_hdr_len; | |||||
ret = be->set_cap(be, features, vnet_hdr_len); | |||||
assert(be->be_vnet_hdr_len == 0 || | |||||
be->be_vnet_hdr_len == be->fe_vnet_hdr_len); | |||||
return ret; | |||||
} | |||||
/*
 * Drop the first 'tlen' bytes from the scatter-gather vector 'iov'.
 *
 * Only the first segment is touched, and it must hold at least 'tlen'
 * bytes. If the segment becomes empty it is skipped: '*iovcnt' is
 * decremented and a pointer to the second segment is returned.
 * Otherwise its base is advanced by 'tlen' and 'iov' itself is returned.
 */
static __inline struct iovec *
iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
{
	struct iovec *first = &iov[0];

	/* XXX short-cut: assume first segment is >= tlen */
	assert(first->iov_len >= tlen);

	first->iov_len -= tlen;
	if (first->iov_len != 0) {
		first->iov_base = (void *)((uintptr_t)first->iov_base + tlen);
		return (first);
	}

	/* The whole first segment was consumed: continue from the next. */
	assert(*iovcnt > 1);
	*iovcnt -= 1;

	return (first + 1);
}
ssize_t | |||||
netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt) | |||||
{ | |||||
assert(be != NULL); | |||||
if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) { | |||||
/* The frontend uses a virtio-net header, but the backend | |||||
* does not. We ignore it (as it must be all zeroes) and | |||||
* strip it. */ | |||||
assert(be->be_vnet_hdr_len == 0); | |||||
iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len); | |||||
} | |||||
return be->send(be, iov, iovcnt); | |||||
} | |||||
Done Inline Actionsreturn () jhb: return () | |||||
/* | |||||
* can return -1 in case of errors | |||||
*/ | |||||
ssize_t | |||||
netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt) | |||||
{ | |||||
/* Length of prepended virtio-net header. */ | |||||
unsigned int hlen = be->fe_vnet_hdr_len; | |||||
int ret; | |||||
assert(be != NULL); | |||||
if (hlen && hlen != be->be_vnet_hdr_len) { | |||||
/* The frontend uses a virtio-net header, but the backend | |||||
* does not. We need to prepend a zeroed header. */ | |||||
struct virtio_net_rxhdr *vh; | |||||
assert(be->be_vnet_hdr_len == 0); | |||||
/* | |||||
* Get a pointer to the rx header, and use the | |||||
* data immediately following it for the packet buffer. | |||||
*/ | |||||
vh = iov[0].iov_base; | |||||
iov = iov_trim(iov, &iovcnt, hlen); | |||||
/* | |||||
* The only valid field in the rx packet header is the | |||||
* number of buffers if merged rx bufs were negotiated. | |||||
*/ | |||||
memset(vh, 0, hlen); | |||||
if (hlen == VNET_HDR_LEN) { | |||||
vh->vrh_bufs = 1; | |||||
} | |||||
} | |||||
ret = be->recv(be, iov, iovcnt); | |||||
if (ret > 0) { | |||||
ret += hlen; | |||||
} | |||||
return ret; | |||||
} | |||||
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
Done Inline Actionsreturn () jhb: return () |
Sorry I missed this earlier, sys/types.h should always be first when it is used.