Index: usr.sbin/bhyve/bhyve.8
===================================================================
--- usr.sbin/bhyve/bhyve.8
+++ usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd December 13, 2019
+.Dd January 23, 2020
 .Dt BHYVE 8
 .Os
 .Sh NAME
@@ -584,6 +584,16 @@
 -A -H -P -m 24G bigvm
 .Ed
 .Pp
+The virtio-net device accepts an additional boolean option,
+.Cm mrgrxbuf ,
+to explicitly enable or disable the virtio-net
+.Dq mergeable rx buffers
+feature.
+If the option is not specified, bhyve will advertise the feature only
+when the
+.Xr vale 4
+net backend is used.
+.Pp
 Run an 8GB quad-CPU virtual machine with 8 AHCI SATA
 disks, an AHCI ATAPI CD-ROM, a single virtio network port, an
 AMD hostbridge, and the console port connected to an
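Note: as a usage sketch for the new option (the tap name, MAC address and disk
image below are invented for the example), mergeable rx buffers can be forced
on for a tap backend, which would otherwise not advertise the feature:

bhyve -c 2 -m 2G -A -H -P \
    -s 0,hostbridge -s 31,lpc \
    -s 2:0,virtio-net,tap0,mac=58:9c:fc:00:00:01,mrgrxbuf=on \
    -s 3,virtio-blk,disk.img \
    -l com1,stdio vm0

Per the parsing code below, any value other than "on" (e.g. mrgrxbuf=off)
clears VIRTIO_NET_F_MRG_RXBUF from the advertised capabilities.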
Index: usr.sbin/bhyve/net_backends.h
===================================================================
--- usr.sbin/bhyve/net_backends.h
+++ usr.sbin/bhyve/net_backends.h
@@ -43,8 +43,9 @@
 uint64_t netbe_get_cap(net_backend_t *be);
 int	 netbe_set_cap(net_backend_t *be, uint64_t cap,
 	    unsigned vnet_hdr_len);
-ssize_t	netbe_send(net_backend_t *be, struct iovec *iov, int iovcnt);
-ssize_t	netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+size_t	netbe_get_vnet_hdr_len(net_backend_t *be);
+ssize_t	netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
+ssize_t	netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t	netbe_rx_discard(net_backend_t *be);
 void	netbe_rx_disable(net_backend_t *be);
 void	netbe_rx_enable(net_backend_t *be);
Index: usr.sbin/bhyve/net_backends.c
===================================================================
--- usr.sbin/bhyve/net_backends.c
+++ usr.sbin/bhyve/net_backends.c
@@ -99,7 +99,8 @@
 	 * vector provided by the caller has 'iovcnt' elements and contains
 	 * the packet to send.
 	 */
-	ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
+	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
+	    int iovcnt);
 
 	/*
 	 * Called to receive a packet from the backend. When the function
@@ -108,7 +109,8 @@
 	 * The function returns 0 if the backend doesn't have a new packet to
 	 * receive.
 	 */
-	ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
+	    int iovcnt);
 
 	/*
 	 * Ask the backend to enable or disable receive operation in the
@@ -238,13 +240,13 @@
  * Called to send a buffer chain out to the tap device
  */
 static ssize_t
-tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	return (writev(be->fd, iov, iovcnt));
 }
 
 static ssize_t
-tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	ssize_t ret;
 
@@ -458,7 +460,7 @@
 }
 
 static ssize_t
-netmap_send(struct net_backend *be, struct iovec *iov,
+netmap_send(struct net_backend *be, const struct iovec *iov,
     int iovcnt)
 {
 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
@@ -538,7 +540,7 @@
 }
 
 static ssize_t
-netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
 	struct netmap_slot *slot = NULL;
@@ -749,42 +751,10 @@
 	return (ret);
 }
 
-static __inline struct iovec *
-iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
-{
-	struct iovec *riov;
-
-	/* XXX short-cut: assume first segment is >= tlen */
-	assert(iov[0].iov_len >= tlen);
-
-	iov[0].iov_len -= tlen;
-	if (iov[0].iov_len == 0) {
-		assert(*iovcnt > 1);
-		*iovcnt -= 1;
-		riov = &iov[1];
-	} else {
-		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
-		riov = &iov[0];
-	}
-
-	return (riov);
-}
-
 ssize_t
-netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 
-	assert(be != NULL);
-	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
-		/*
-		 * The frontend uses a virtio-net header, but the backend
-		 * does not. We ignore it (as it must be all zeroes) and
-		 * strip it.
-		 */
-		assert(be->be_vnet_hdr_len == 0);
-		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
-	}
-
 	return (be->send(be, iov, iovcnt));
 }
 
@@ -794,46 +764,10 @@
  * the length of the packet just read. Return -1 in case of errors.
  */
 ssize_t
-netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
-	/* Length of prepended virtio-net header. */
-	unsigned int hlen = be->fe_vnet_hdr_len;
-	int ret;
-
-	assert(be != NULL);
-
-	if (hlen && hlen != be->be_vnet_hdr_len) {
-		/*
-		 * The frontend uses a virtio-net header, but the backend
-		 * does not. We need to prepend a zeroed header.
-		 */
-		struct virtio_net_rxhdr *vh;
-
-		assert(be->be_vnet_hdr_len == 0);
-
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vh = iov[0].iov_base;
-		iov = iov_trim(iov, &iovcnt, hlen);
-
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vh, 0, hlen);
-		if (hlen == VNET_HDR_LEN) {
-			vh->vrh_bufs = 1;
-		}
-	}
-
-	ret = be->recv(be, iov, iovcnt);
-	if (ret > 0) {
-		ret += hlen;
-	}
-
-	return (ret);
+
+	return (be->recv(be, iov, iovcnt));
 }
 
 /*
@@ -871,3 +805,10 @@
 
 	return be->recv_enable(be);
 }
+
+size_t
+netbe_get_vnet_hdr_len(struct net_backend *be)
+{
+
+	return (be->be_vnet_hdr_len);
+}
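Note: with the vnet header fixups moved out, netbe_send() and netbe_recv() no
longer modify the scatter/gather list, which is what makes the
const struct iovec * prototypes possible; writev(2) already takes const
iovecs, as this standalone sketch (not bhyve code, buffer sizes invented)
shows:

#include <sys/uio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char hdr[12];		/* stand-in for a 12-byte virtio-net header */
	char payload[] = "hello\n";
	const struct iovec iov[2] = {
		{ .iov_base = hdr, .iov_len = sizeof(hdr) },
		{ .iov_base = payload, .iov_len = sizeof(payload) - 1 },
	};

	/* The guest is expected to zero-fill the header; mimic that. */
	memset(hdr, 0, sizeof(hdr));

	/* writev(2) accepts const iovecs, so pass-through backends can too. */
	return (writev(STDOUT_FILENO, iov, 2) < 0);
}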
Index: usr.sbin/bhyve/net_utils.c
===================================================================
--- usr.sbin/bhyve/net_utils.c
+++ usr.sbin/bhyve/net_utils.c
@@ -44,21 +44,19 @@
 net_parsemac(char *mac_str, uint8_t *mac_addr)
 {
 	struct ether_addr *ea;
-	char *tmpstr;
 	char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
 
-	tmpstr = strsep(&mac_str,"=");
+	if (mac_str == NULL)
+		return (EINVAL);
 
-	if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
-		ea = ether_aton(mac_str);
+	ea = ether_aton(mac_str);
 
-		if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
-		    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
-			EPRINTLN("Invalid MAC %s", mac_str);
-			return (EINVAL);
-		} else
-			memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
-	}
+	if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
+	    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
+		EPRINTLN("Invalid MAC %s", mac_str);
+		return (EINVAL);
+	} else
+		memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
 
 	return (0);
 }
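Note: net_parsemac() now takes the bare address value, since the mac= key is
consumed by the per-device option parser. A standalone sketch of the same
validation rules (FreeBSD's <net/ethernet.h> assumed, addresses arbitrary):

#include <sys/types.h>
#include <net/ethernet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same checks as net_parsemac(): non-NULL, parseable, unicast, non-zero. */
static int
parsemac(const char *mac_str, uint8_t *mac_addr)
{
	const uint8_t zero_addr[ETHER_ADDR_LEN] = { 0 };
	struct ether_addr *ea;

	if (mac_str == NULL || (ea = ether_aton(mac_str)) == NULL ||
	    ETHER_IS_MULTICAST(ea->octet) ||
	    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0)
		return (-1);
	memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
	return (0);
}

int
main(void)
{
	uint8_t mac[ETHER_ADDR_LEN];

	printf("%d\n", parsemac("58:9c:fc:00:00:01", mac));	/* 0: valid */
	printf("%d\n", parsemac("01:00:5e:00:00:01", mac));	/* -1: multicast */
	return (0);
}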
Index: usr.sbin/bhyve/pci_e82545.c
===================================================================
--- usr.sbin/bhyve/pci_e82545.c
+++ usr.sbin/bhyve/pci_e82545.c
@@ -2328,18 +2328,36 @@
 	mac_provided = 0;
 	sc->esc_be = NULL;
 	if (opts != NULL) {
-		int err;
+		int err = 0;
 
 		devname = vtopts = strdup(opts);
 		(void) strsep(&vtopts, ",");
 
-		if (vtopts != NULL) {
-			err = net_parsemac(vtopts, sc->esc_mac.octet);
-			if (err != 0) {
-				free(devname);
-				return (err);
+		/*
+		 * Parse the list of options in the form
+		 *     key1=value1,...,keyN=valueN.
+		 */
+		while (vtopts != NULL) {
+			char *value = vtopts;
+			char *key;
+
+			key = strsep(&value, "=");
+			if (value == NULL)
+				break;
+			vtopts = value;
+			(void) strsep(&vtopts, ",");
+
+			if (strcmp(key, "mac") == 0) {
+				err = net_parsemac(value, sc->esc_mac.octet);
+				if (err)
+					break;
+				mac_provided = 1;
 			}
-			mac_provided = 1;
+		}
+
+		if (err) {
+			free(devname);
+			return (err);
 		}
 
 		err = netbe_init(&sc->esc_be, devname, e82545_rx_callback, sc);
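Note: the strsep(3) loop above generalizes the old single-purpose mac=
handling to an arbitrary key=value list, and pci_virtio_net.c below uses the
same pattern. A standalone sketch with a hypothetical option string hardwired
(FreeBSD libc assumed for strsep):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	char *opts = strdup("tap0,mac=58:9c:fc:00:00:01,mrgrxbuf=on");
	char *devname, *vtopts;

	/* The first comma-separated token is the backend device name. */
	devname = vtopts = opts;
	(void) strsep(&vtopts, ",");
	printf("device: %s\n", devname);

	/* Remaining tokens must be key=value pairs. */
	while (vtopts != NULL) {
		char *value = vtopts;
		char *key;

		key = strsep(&value, "=");
		if (value == NULL)	/* token without '=': stop */
			break;
		vtopts = value;
		(void) strsep(&vtopts, ",");
		printf("key '%s' -> value '%s'\n", key, value);
	}

	free(opts);
	return (0);
}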
Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -117,6 +117,9 @@
 	pthread_cond_t tx_cond;
 	int tx_in_progress;
 
+	size_t vhdrlen;
+	size_t be_vhdrlen;
+
 	struct virtio_net_config vsc_config;
 	struct virtio_consts vsc_consts;
 };
@@ -180,6 +183,38 @@
 	pthread_mutex_unlock(&sc->rx_mtx);
 }
 
+static __inline struct iovec *
+iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+	struct iovec *riov;
+
+	if (iov[0].iov_len < tlen) {
+		/*
+		 * Not enough header space in the first fragment.
+		 * That's not ok for us.
+		 */
+		return (NULL);
+	}
+
+	iov[0].iov_len -= tlen;
+	if (iov[0].iov_len == 0) {
+		*iovcnt -= 1;
+		if (*iovcnt == 0) {
+			/*
+			 * Only space for the header. That's not
+			 * enough for us.
+			 */
+			return (NULL);
+		}
+		riov = &iov[1];
+	} else {
+		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+		riov = &iov[0];
+	}
+
+	return (riov);
+}
+
 struct virtio_mrg_rxbuf_info {
 	uint16_t idx;
 	uint16_t pad;
@@ -189,31 +224,34 @@
 static void
 pci_vtnet_rx(struct pci_vtnet_softc *sc)
 {
+	int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen;
 	struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
 	struct iovec iov[VTNET_MAXSEGS + 1];
 	struct vqueue_info *vq;
-	uint32_t cur_iov_bytes;
-	struct iovec *cur_iov;
-	uint16_t cur_iov_len;
+	uint32_t riov_bytes;
+	struct iovec *riov;
+	int riov_len;
 	uint32_t ulen;
 	int n_chains;
 	int len;
 
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	for (;;) {
+		struct virtio_net_rxhdr *hdr;
+
 		/*
 		 * Get a descriptor chain to store the next ingress
 		 * packet. In case of mergeable rx buffers, get as
 		 * many chains as necessary in order to make room
 		 * for a maximum sized LRO packet.
 		 */
-		cur_iov_bytes = 0;
-		cur_iov_len = 0;
-		cur_iov = iov;
+		riov_bytes = 0;
+		riov_len = 0;
+		riov = iov;
 		n_chains = 0;
 		do {
-			int n = vq_getchain(vq, &info[n_chains].idx, cur_iov,
-			    VTNET_MAXSEGS - cur_iov_len, NULL);
+			int n = vq_getchain(vq, &info[n_chains].idx, riov,
+			    VTNET_MAXSEGS - riov_len, NULL);
 
 			if (n == 0) {
 				/*
@@ -239,20 +277,42 @@
 				vq_kick_disable(vq);
 				continue;
 			}
-			assert(n >= 1 && cur_iov_len + n <= VTNET_MAXSEGS);
-			cur_iov_len += n;
+			assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS);
+			riov_len += n;
 			if (!sc->rx_merge) {
 				n_chains = 1;
 				break;
 			}
-			info[n_chains].len = (uint32_t)count_iov(cur_iov, n);
-			cur_iov_bytes += info[n_chains].len;
-			cur_iov += n;
+			info[n_chains].len = (uint32_t)count_iov(riov, n);
+			riov_bytes += info[n_chains].len;
+			riov += n;
 			n_chains++;
-		} while (cur_iov_bytes < VTNET_MAX_PKT_LEN &&
-		    cur_iov_len < VTNET_MAXSEGS);
+		} while (riov_bytes < VTNET_MAX_PKT_LEN &&
+		    riov_len < VTNET_MAXSEGS);
+
+		riov = iov;
+		hdr = riov[0].iov_base;
+		if (prepend_hdr_len > 0) {
+			/*
+			 * The frontend uses a virtio-net header, but the
+			 * backend does not. We need to prepend a zeroed
+			 * header.
+			 */
+			riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len);
+			if (riov == NULL) {
+				/*
+				 * The first collected chain is nonsensical,
+				 * as it is not even enough to store the
+				 * virtio-net header. Just drop it.
+				 */
+				vq_relchain(vq, info[0].idx, 0);
+				vq_retchains(vq, n_chains - 1);
+				continue;
+			}
+			memset(hdr, 0, prepend_hdr_len);
+		}
 
-		len = netbe_recv(sc->vsc_be, iov, cur_iov_len);
+		len = netbe_recv(sc->vsc_be, riov, riov_len);
 
 		if (len <= 0) {
 			/*
@@ -266,18 +326,18 @@
 			return;
 		}
 
-		ulen = (uint32_t)len; /* avoid too many casts below */
+		ulen = (uint32_t)(len + prepend_hdr_len);
 
-		/* Publish the used buffers to the guest. */
+		/*
+		 * Publish the used buffers to the guest, reporting the
+		 * number of bytes that we wrote.
+		 */
 		if (!sc->rx_merge) {
 			vq_relchain(vq, info[0].idx, ulen);
 		} else {
-			struct virtio_net_rxhdr *hdr = iov[0].iov_base;
 			uint32_t iolen;
 			int i = 0;
 
-			assert(iov[0].iov_len >= sizeof(*hdr));
-
 			do {
 				iolen = info[i].len;
 				if (iolen > ulen) {
@@ -333,6 +393,7 @@
 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
 {
 	struct iovec iov[VTNET_MAXSEGS + 1];
+	struct iovec *siov = iov;
 	uint16_t idx;
 	ssize_t len;
 	int n;
@@ -344,10 +405,34 @@
 	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
 	assert(n >= 1 && n <= VTNET_MAXSEGS);
 
-	len = netbe_send(sc->vsc_be, iov, n);
+	if (sc->vhdrlen != sc->be_vhdrlen) {
+		/*
+		 * The frontend uses a virtio-net header, but the backend
+		 * does not. We simply strip the header and ignore it, as
+		 * it should be zero-filled.
+		 */
+		siov = iov_trim_hdr(siov, &n, sc->vhdrlen);
+	}
+
+	if (siov == NULL) {
+		/* The chain is nonsensical. Just drop it. */
+		len = 0;
+	} else {
+		len = netbe_send(sc->vsc_be, siov, n);
+		if (len < 0) {
+			/*
+			 * If send failed, report that 0 bytes
+			 * were written.
+			 */
+			len = 0;
+		}
+	}
 
-	/* chain is processed, release it and set len */
-	vq_relchain(vq, idx, len > 0 ? len : 0);
+	/*
+	 * Return the processed chain to the guest, reporting
+	 * the number of bytes that we read.
+	 */
+	vq_relchain(vq, idx, len);
 }
 
 /* Called on TX kick. */
@@ -464,21 +549,50 @@
  */
 	mac_provided = 0;
 	if (opts != NULL) {
+		int mrg_rxbuf_enabled = 0;
+		int mrg_rxbuf_disabled = 0;
 		char *devname;
 		char *vtopts;
-		int err;
+		int err = 0;
 
+		/* Get the device name. */
 		devname = vtopts = strdup(opts);
 		(void) strsep(&vtopts, ",");
 
-		if (vtopts != NULL) {
-			err = net_parsemac(vtopts, sc->vsc_config.mac);
-			if (err != 0) {
-				free(devname);
-				free(sc);
-				return (err);
+		/*
+		 * Parse the list of options in the form
+		 *     key1=value1,...,keyN=valueN.
+		 */
+		while (vtopts != NULL) {
+			char *value = vtopts;
+			char *key;
+
+			key = strsep(&value, "=");
+			if (value == NULL)
+				break;
+			vtopts = value;
+			(void) strsep(&vtopts, ",");
+
+			if (strcmp(key, "mac") == 0) {
+				err = net_parsemac(value, sc->vsc_config.mac);
+				if (err)
+					break;
+				mac_provided = 1;
+			} else if (strcmp(key, "mrgrxbuf") == 0) {
+				if (strcmp(value, "on") == 0) {
+					mrg_rxbuf_enabled = 1;
+					mrg_rxbuf_disabled = 0;
+				} else {
+					mrg_rxbuf_disabled = 1;
+					mrg_rxbuf_enabled = 0;
+				}
 			}
-			mac_provided = 1;
+		}
+
+		if (err) {
+			free(devname);
+			free(sc);
+			return (err);
 		}
 
 		err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback,
@@ -489,6 +603,11 @@
 			return (err);
 		}
 		sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
+		if (mrg_rxbuf_enabled)
+			sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF;
+		else if (mrg_rxbuf_disabled)
+			sc->vsc_consts.vc_hv_caps &= ~VIRTIO_NET_F_MRG_RXBUF;
+
 	}
 
 	if (!mac_provided) {
@@ -520,6 +639,7 @@
 	sc->resetting = 0;
 
 	sc->rx_merge = 0;
+	sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
 	pthread_mutex_init(&sc->rx_mtx, NULL);
 
 	/*
@@ -574,24 +694,25 @@
 pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
 {
 	struct pci_vtnet_softc *sc = vsc;
-	unsigned int rx_vhdrlen;
 
 	sc->vsc_features = negotiated_features;
 
 	if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) {
-		rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+		sc->vhdrlen = sizeof(struct virtio_net_rxhdr);
		sc->rx_merge = 1;
 	} else {
 		/*
 		 * Without mergeable rx buffers, virtio-net header is 2
 		 * bytes shorter than sizeof(struct virtio_net_rxhdr).
 		 */
-		rx_vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
+		sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
 		sc->rx_merge = 0;
 	}
 
 	/* Tell the backend to enable some capabilities it has advertised. */
-	netbe_set_cap(sc->vsc_be, negotiated_features, rx_vhdrlen);
+	netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen);
+	sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be);
+	assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen);
 }
 
 static struct pci_devemu pci_de_vnet = {
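Note: the per-queue header adjustment hinges on the iov_trim_hdr() arithmetic.
A standalone sketch (the helper body mirrors the one added above; buffer sizes
invented) of the common case where the first fragment is larger than the
10-byte legacy virtio-net header:

#include <sys/uio.h>
#include <stdint.h>
#include <stdio.h>

static struct iovec *
iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int tlen)
{
	struct iovec *riov;

	if (iov[0].iov_len < tlen)
		return (NULL);	/* header split across fragments: give up */

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		*iovcnt -= 1;
		if (*iovcnt == 0)
			return (NULL);	/* chain held only the header */
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

int
main(void)
{
	char buf0[12], buf1[64];
	struct iovec iov[2] = {
		{ .iov_base = buf0, .iov_len = sizeof(buf0) },
		{ .iov_base = buf1, .iov_len = sizeof(buf1) },
	};
	int cnt = 2;
	/* Trim sizeof(struct virtio_net_rxhdr) - 2 == 10 header bytes. */
	struct iovec *riov = iov_trim_hdr(iov, &cnt, 10);

	/* Prints "cnt=2 first_len=2": 2 bytes of buf0 remain usable. */
	printf("cnt=%d first_len=%zu\n", cnt, riov->iov_len);
	return (0);
}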