Changeset View
Standalone View
usr.sbin/bhyve/net_backends.c
Show All 37 Lines | |||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include <sys/types.h> /* u_short etc */ | #include <sys/types.h> /* u_short etc */ | ||||
#ifndef WITHOUT_CAPSICUM | #ifndef WITHOUT_CAPSICUM | ||||
#include <sys/capsicum.h> | #include <sys/capsicum.h> | ||||
#endif | #endif | ||||
#include <sys/ioctl.h> | #include <sys/ioctl.h> | ||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#include <sys/sysctl.h> | |||||
#include <sys/uio.h> | #include <sys/uio.h> | ||||
#include <net/if.h> | #include <net/if.h> | ||||
#include <net/netmap.h> | #include <net/netmap.h> | ||||
#include <net/netmap_virt.h> | #include <net/netmap_virt.h> | ||||
#define NETMAP_WITH_LIBS | #define NETMAP_WITH_LIBS | ||||
#include <net/netmap_user.h> | #include <net/netmap_user.h> | ||||
Show All 10 Lines | |||||
#include <unistd.h> | #include <unistd.h> | ||||
#include <sysexits.h> | #include <sysexits.h> | ||||
#include <assert.h> | #include <assert.h> | ||||
#include <pthread.h> | #include <pthread.h> | ||||
#include <pthread_np.h> | #include <pthread_np.h> | ||||
#include <poll.h> | #include <poll.h> | ||||
#include <assert.h> | #include <assert.h> | ||||
#ifdef NETGRAPH | |||||
#include <sys/param.h> | |||||
#include <netgraph.h> | |||||
#endif | |||||
#include "debug.h" | #include "debug.h" | ||||
#include "iov.h" | #include "iov.h" | ||||
#include "mevent.h" | #include "mevent.h" | ||||
#include "net_backends.h" | #include "net_backends.h" | ||||
#include <sys/linker_set.h> | #include <sys/linker_set.h> | ||||
▲ Show 20 Lines • Show All 297 Lines • ▼ Show 20 Lines | static struct net_backend vmnet_backend = { | ||||
.recv_enable = tap_recv_enable, | .recv_enable = tap_recv_enable, | ||||
.recv_disable = tap_recv_disable, | .recv_disable = tap_recv_disable, | ||||
.get_cap = tap_get_cap, | .get_cap = tap_get_cap, | ||||
.set_cap = tap_set_cap, | .set_cap = tap_set_cap, | ||||
}; | }; | ||||
DATA_SET(net_backend_set, tap_backend); | DATA_SET(net_backend_set, tap_backend); | ||||
DATA_SET(net_backend_set, vmnet_backend); | DATA_SET(net_backend_set, vmnet_backend); | ||||
#ifdef NETGRAPH | |||||
/* | |||||
* Netgraph backend | |||||
*/ | |||||
#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) | |||||
static int | |||||
ng_init(struct net_backend *be, const char *devname, | |||||
net_be_rxeof_t cb, void *param) | |||||
{ | |||||
struct tap_priv *p = (struct tap_priv *)be->opaque; | |||||
struct ngm_connect ngc; | |||||
char *ngopts, *tofree; | |||||
char nodename[NG_NODESIZ]; | |||||
int sbsz; | |||||
int ctrl_sock; | |||||
int flags; | |||||
int path_provided; | |||||
int peerhook_provided; | |||||
int socket_provided; | |||||
unsigned long maxsbsz; | |||||
size_t msbsz; | |||||
#ifndef WITHOUT_CAPSICUM | |||||
cap_rights_t rights; | |||||
#endif | |||||
if (cb == NULL) { | |||||
WPRINTF(("Netgraph backend requires non-NULL callback")); | |||||
return (-1); | |||||
} | |||||
be->fd = -1; | |||||
memset(&ngc, 0, sizeof(ngc)); | |||||
strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1); | |||||
tofree = ngopts = strdup(devname); | |||||
if (ngopts == NULL) { | |||||
WPRINTF(("strdup error")); | |||||
return (-1); | |||||
} | |||||
socket_provided = 0; | |||||
path_provided = 0; | |||||
peerhook_provided = 0; | |||||
(void)strsep(&ngopts, "/"); | |||||
while (ngopts != NULL) { | |||||
char *value = ngopts; | |||||
char *key; | |||||
key = strsep(&value, "="); | |||||
if (value == NULL) | |||||
break; | |||||
ngopts = value; | |||||
(void) strsep(&ngopts, "/"); | |||||
if (strcmp(key, "socket") == 0) { | |||||
strncpy(nodename, value, NG_NODESIZ - 1); | |||||
socket_provided = 1; | |||||
} else if (strcmp(key, "path") == 0) { | |||||
strncpy(ngc.path, value, NG_PATHSIZ - 1); | |||||
path_provided = 1; | |||||
} else if (strcmp(key, "hook") == 0) { | |||||
strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); | |||||
} else if (strcmp(key, "peerhook") == 0) { | |||||
strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); | |||||
peerhook_provided = 1; | |||||
} else { | |||||
WPRINTF(("incorrect network backend options: %s", key)); | |||||
free(tofree); | |||||
return (-1); | |||||
} | |||||
} | |||||
free(tofree); | |||||
if (!path_provided) { | |||||
WPRINTF(("path must be provided")); | |||||
return (-1); | |||||
} | |||||
if (!peerhook_provided) { | |||||
WPRINTF(("peer hook must be provided")); | |||||
return (-1); | |||||
} | |||||
if (NgMkSockNode(socket_provided ? nodename : NULL, | |||||
&ctrl_sock, &be->fd) < 0) { | |||||
WPRINTF(("can't get Netgraph sockets")); | |||||
return (-1); | |||||
} | |||||
if (NgSendMsg(ctrl_sock, ".", | |||||
NGM_GENERIC_COOKIE, | |||||
NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { | |||||
WPRINTF(("can't connect to node")); | |||||
close(ctrl_sock); | |||||
goto error; | |||||
} | |||||
close(ctrl_sock); | |||||
flags = fcntl(be->fd, F_GETFL); | |||||
if (flags < 0) { | |||||
WPRINTF(("can't get socket flags")); | |||||
goto error; | |||||
} | |||||
if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { | |||||
WPRINTF(("can't set O_NONBLOCK flag")); | |||||
goto error; | |||||
} | |||||
/* | |||||
* The default ng_socket(4) buffer's size is too low. | |||||
* Calculate the minimum value between NG_SBUF_MAX_SIZE | |||||
* and kern.ipc.maxsockbuf. | |||||
*/ | |||||
msbsz = sizeof(maxsbsz); | |||||
if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, | |||||
NULL, 0) < 0) { | |||||
WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); | |||||
goto error; | |||||
} | |||||
/* | |||||
* We can't set the socket buffer size to kern.ipc.maxsockbuf value, | |||||
* as it takes into account the mbuf(9) overhead. | |||||
*/ | |||||
maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); | |||||
sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); | |||||
if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, | |||||
sizeof(sbsz)) < 0) { | |||||
WPRINTF(("can't set TX buffer size")); | |||||
goto error; | |||||
} | |||||
if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, | |||||
sizeof(sbsz)) < 0) { | |||||
WPRINTF(("can't set RX buffer size")); | |||||
goto error; | |||||
} | |||||
#ifndef WITHOUT_CAPSICUM | |||||
cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); | |||||
if (caph_rights_limit(be->fd, &rights) == -1) | |||||
errx(EX_OSERR, "Unable to apply rights for sandbox"); | |||||
#endif | |||||
memset(p->bbuf, 0, sizeof(p->bbuf)); | |||||
p->bbuflen = 0; | |||||
p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); | |||||
if (p->mevp == NULL) { | |||||
WPRINTF(("Could not register event")); | |||||
goto error; | |||||
} | |||||
return (0); | |||||
error: | |||||
tap_cleanup(be); | |||||
return (-1); | |||||
} | |||||
static struct net_backend ng_backend = { | |||||
.prefix = "netgraph", | |||||
.priv_size = sizeof(struct tap_priv), | |||||
.init = ng_init, | |||||
.cleanup = tap_cleanup, | |||||
.send = tap_send, | |||||
.peek_recvlen = tap_peek_recvlen, | |||||
.recv = tap_recv, | |||||
.recv_enable = tap_recv_enable, | |||||
.recv_disable = tap_recv_disable, | |||||
.get_cap = tap_get_cap, | |||||
.set_cap = tap_set_cap, | |||||
}; | |||||
DATA_SET(net_backend_set, ng_backend); | |||||
#endif /* NETGRAPH */ | |||||
/* | /* | ||||
* The netmap backend | * The netmap backend | ||||
*/ | */ | ||||
/* The virtio-net features supported by netmap. */ | /* The virtio-net features supported by netmap. */ | ||||
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ | #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ | ||||
VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ | VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ | ||||
▲ Show 20 Lines • Show All 348 Lines • ▼ Show 20 Lines | |||||
* e.g. -s 2:0,frontend-name,backend-name[,other-args] | * e.g. -s 2:0,frontend-name,backend-name[,other-args] | ||||
* @cb is the receive callback supplied by the frontend, | * @cb is the receive callback supplied by the frontend, | ||||
* and it is invoked in the event loop when a receive | * and it is invoked in the event loop when a receive | ||||
* event is generated in the hypervisor, | * event is generated in the hypervisor, | ||||
* @param is a pointer to the frontend, and normally used as | * @param is a pointer to the frontend, and normally used as | ||||
* the argument for the callback. | * the argument for the callback. | ||||
*/ | */ | ||||
int | int | ||||
netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, | netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, | ||||
vmaffione: Most of the code is copy-pasted from tap. And the `priv` is identical. We could reuse `struct… | |||||
void *param) | void *param) | ||||
{ | { | ||||
struct net_backend **pbe, *nbe, *tbe = NULL; | struct net_backend **pbe, *nbe, *tbe = NULL; | ||||
int err; | int err; | ||||
/* | /* | ||||
* Find the network backend that matches the user-provided | * Find the network backend that matches the user-provided | ||||
* device name. net_backend_set is built using a linker set. | * device name. net_backend_set is built using a linker set. | ||||
Show All 33 Lines | netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, | ||||
return (0); | return (0); | ||||
} | } | ||||
void | void | ||||
netbe_cleanup(struct net_backend *be) | netbe_cleanup(struct net_backend *be) | ||||
{ | { | ||||
if (be != NULL) { | if (be != NULL) { | ||||
Done Inline Actionswe haven't allocated anything here, so it may be more readable to avoid a goto and just return -1 vmaffione: we haven't allocated anything here, so it may be more readable to avoid a goto and just return… | |||||
be->cleanup(be); | be->cleanup(be); | ||||
free(be); | free(be); | ||||
} | } | ||||
} | } | ||||
uint64_t | uint64_t | ||||
netbe_get_cap(struct net_backend *be) | netbe_get_cap(struct net_backend *be) | ||||
{ | { | ||||
Not Done Inline ActionsCan we use "," as a separator? This would be consistent with the other places where we do this kind of parsing. vmaffione: Can we use "," as a separator? This would be consistent with the other places where we do this… | |||||
Done Inline ActionsThe main problem is that the frontend does not pass options separated by the “,” symbol to the backend. Therefore, if we pass “netgraph,socket=X,etc.” from the command line, ng_init() will only get “netgraph”. To change this behavior, we need to change all the backends. What do you think about this? afedorov: The main problem is that the frontend does not pass options separated by the “,” symbol to the… | |||||
Not Done Inline ActionsRight, I missed that. -s 2:0,frontend-name,backend-name;beopt1=beval1;...;beoptN=bevalN,feopt1=feval1,...,feoptN=fevalN and so with a mixture of , and ;, and the backend arguments before the frontend arguments. Rather than this, I would rather make the life easier to the user, by allowing him to use , as a separator in any case, and specify frontend and backend options in the order she wishes. The real problem here is that frontend and backend should have been separate options (like in QEMU), but in bhyve we specify them both with a single -s option. But such a change would be too invasive and not backward-compat. vmaffione: Right, I missed that.
The `/` separator, in particular, looks very confusing because it is used… | |||||
assert(be != NULL); | assert(be != NULL); | ||||
return (be->get_cap(be)); | return (be->get_cap(be)); | ||||
} | } | ||||
int | int | ||||
netbe_set_cap(struct net_backend *be, uint64_t features, | netbe_set_cap(struct net_backend *be, uint64_t features, | ||||
unsigned vnet_hdr_len) | unsigned vnet_hdr_len) | ||||
{ | { | ||||
int ret; | int ret; | ||||
assert(be != NULL); | assert(be != NULL); | ||||
/* There are only three valid lengths, i.e., 0, 10 and 12. */ | /* There are only three valid lengths, i.e., 0, 10 and 12. */ | ||||
if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN | if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN | ||||
Not Done Inline ActionsYour code assumes, that the name of the netgraph node is specified, so better call this option "nodename" instead of "path". donner: Your code assumes, that the name of the netgraph node is specified, so better call this option… | |||||
Done Inline ActionsWhat do you think about 'relpath', similar to ngctl connect options naming? afedorov: What do you think about 'relpath', similar to ngctl connect options naming? | |||||
&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) | && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) | ||||
Done Inline ActionsHow do you specify a "unnamed" node? Like lagg12:lower.vlan34 (assuming a ng_ether on lagg12, connected to an unnamed ng_vlan). Your separator is conflicting with the valid path characters. donner: How do you specify a "unnamed" node? Like lagg12:lower.vlan34 (assuming a ng_ether on lagg12… | |||||
return (-1); | return (-1); | ||||
Done Inline ActionsAssuming you have provided an argument longer than NG_PATHSIZ, so you silently truncate the path argument, correct? Furthermore appending a colon assumes, that the path to be the name of an node only. donner: Assuming you have provided an argument longer than NG_PATHSIZ, so you silently truncate the… | |||||
be->fe_vnet_hdr_len = vnet_hdr_len; | be->fe_vnet_hdr_len = vnet_hdr_len; | ||||
ret = be->set_cap(be, features, vnet_hdr_len); | ret = be->set_cap(be, features, vnet_hdr_len); | ||||
assert(be->be_vnet_hdr_len == 0 || | assert(be->be_vnet_hdr_len == 0 || | ||||
be->be_vnet_hdr_len == be->fe_vnet_hdr_len); | be->be_vnet_hdr_len == be->fe_vnet_hdr_len); | ||||
return (ret); | return (ret); | ||||
} | } | ||||
Done Inline Actionssame thing here, it's probably cleaner to just return -1 vmaffione: same thing here, it's probably cleaner to just return -1 | |||||
ssize_t | ssize_t | ||||
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) | netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) | ||||
{ | { | ||||
return (be->send(be, iov, iovcnt)); | return (be->send(be, iov, iovcnt)); | ||||
} | } | ||||
Done Inline Actionssame as above vmaffione: same as above | |||||
ssize_t | ssize_t | ||||
netbe_peek_recvlen(struct net_backend *be) | netbe_peek_recvlen(struct net_backend *be) | ||||
{ | { | ||||
return (be->peek_recvlen(be)); | return (be->peek_recvlen(be)); | ||||
Done Inline Actionssame as above vmaffione: same as above | |||||
} | } | ||||
/* | /* | ||||
* Try to read a packet from the backend, without blocking. | * Try to read a packet from the backend, without blocking. | ||||
* If no packets are available, return 0. In case of success, return | * If no packets are available, return 0. In case of success, return | ||||
* the length of the packet just read. Return -1 in case of errors. | * the length of the packet just read. Return -1 in case of errors. | ||||
*/ | */ | ||||
ssize_t | ssize_t | ||||
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) | netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) | ||||
{ | { | ||||
return (be->recv(be, iov, iovcnt)); | return (be->recv(be, iov, iovcnt)); | ||||
} | } | ||||
Done Inline Actionsshouldn't you close the control socket here, to avoid resource leak? vmaffione: shouldn't you close the control socket here, to avoid resource leak? | |||||
/* | /* | ||||
* Read a packet from the backend and discard it. | * Read a packet from the backend and discard it. | ||||
* Returns the size of the discarded packet or zero if no packet was available. | * Returns the size of the discarded packet or zero if no packet was available. | ||||
* A negative error code is returned in case of read error. | * A negative error code is returned in case of read error. | ||||
*/ | */ | ||||
ssize_t | ssize_t | ||||
netbe_rx_discard(struct net_backend *be) | netbe_rx_discard(struct net_backend *be) | ||||
Show All 19 Lines | netbe_rx_disable(struct net_backend *be) | ||||
return be->recv_disable(be); | return be->recv_disable(be); | ||||
} | } | ||||
void | void | ||||
netbe_rx_enable(struct net_backend *be) | netbe_rx_enable(struct net_backend *be) | ||||
{ | { | ||||
return be->recv_enable(be); | return be->recv_enable(be); | ||||
} | } | ||||
Done Inline ActionsIs your manual optimization always better than the compiler code? donner: Is your manual optimization always better than the compiler code? | |||||
Done Inline Actions+1 vmaffione: +1
Also, this is initialization code, so it would not make any difference. | |||||
Done Inline ActionsI just don't want to use float to int conversion. afedorov: I just don't want to use float to int conversion.
The problem with kern.ipc.maxsockbuf is… | |||||
Done Inline ActionsYes, I was suggesting using 3*x/4, which does not use floating point instructions. vmaffione: Yes, I was suggesting using `3*x/4`, which does not use floating point instructions. | |||||
size_t | size_t | ||||
netbe_get_vnet_hdr_len(struct net_backend *be) | netbe_get_vnet_hdr_len(struct net_backend *be) | ||||
{ | { | ||||
Done Inline ActionsI'm curious: have you tried what happens if you try to set a value greater than the maximum allowed one? Do you get a failure or a saturation (to the kernel max value)? vmaffione: I'm curious: have you tried what happens if you try to set a value greater than the maximum… | |||||
Done Inline Actionsf I try to set the values greater than or equal to kern.ipc.maxsockbuf, then setsockopt () return ENOBUFS. afedorov: f I try to set the values greater than or equal to kern.ipc.maxsockbuf, then setsockopt ()… | |||||
return (be->be_vnet_hdr_len); | return (be->be_vnet_hdr_len); | ||||
} | } | ||||
Done Inline ActionsYou have an error state, your parameters might be invalid or unset. Especially be->fd might be 0 (from memset). ng_cleanup will close this arbitrary file descriptor unconditionally. Is this really intended? donner: You have an error state, your parameters might be invalid or unset.
Especially be->fd might be… | |||||
Done Inline ActionsThe value be->fd is initialized to -1 on line 794. So I think ng_cleaunup() must work as expected. afedorov: The value be->fd is initialized to -1 on line 794. So I think ng_cleaunup() must work as… | |||||
Done Inline ActionsBetter return EINVAL, because no parameter can be set. donner: Better return EINVAL, because no parameter can be set. | |||||
Done Inline Actions+1 vmaffione: +1
we just do the same as `tap_set_cap()` |
Most of the code is copy-pasted from tap. And the priv is identical. We could reuse struct tap_priv and all the callbacks but ng_init. This would simplify this patch to just ng_init and the ng_backend struct.