Index: head/usr.sbin/bhyve/iov.h
===================================================================
--- head/usr.sbin/bhyve/iov.h
+++ head/usr.sbin/bhyve/iov.h
@@ -38,7 +38,7 @@
 void truncate_iov(struct iovec *iov, int *niov, size_t length);
 size_t count_iov(const struct iovec *iov, int niov);
 ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
-ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
-    size_t seek);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+    int niov, size_t seek);
 
 #endif /* _IOV_H_ */

Index: head/usr.sbin/bhyve/iov.c
===================================================================
--- head/usr.sbin/bhyve/iov.c
+++ head/usr.sbin/bhyve/iov.c
@@ -119,24 +119,25 @@
 }
 
 ssize_t
-buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
     size_t seek)
 {
 	struct iovec *diov;
-	int ndiov, i;
 	size_t off = 0, len;
+	int i;
 
 	if (seek > 0) {
+		int ndiov;
+
 		diov = malloc(sizeof(struct iovec) * niov);
 		seek_iov(iov, niov, diov, &ndiov, seek);
-	} else {
-		diov = iov;
-		ndiov = niov;
+		iov = diov;
+		niov = ndiov;
 	}
 
-	for (i = 0; i < ndiov && off < buflen; i++) {
-		len = MIN(diov[i].iov_len, buflen - off);
-		memcpy(diov[i].iov_base, buf + off, len);
+	for (i = 0; i < niov && off < buflen; i++) {
+		len = MIN(iov[i].iov_len, buflen - off);
+		memcpy(iov[i].iov_base, buf + off, len);
 		off += len;
 	}
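For reference, here is a minimal standalone sketch of what buf_to_iov() does on its seek == 0 path: copy a contiguous buffer into a scatter-gather vector, stopping when either the buffer or the vector runs out. The payload and segment sizes below are illustrative, not taken from bhyve.

#include <sys/uio.h>
#include <stdio.h>
#include <string.h>

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

int
main(void)
{
	char payload[] = "0123456789abcdef";	/* 16 bytes + NUL = 17 */
	char seg0[4], seg1[8], seg2[16];
	struct iovec iov[3] = {
		{ .iov_base = seg0, .iov_len = sizeof(seg0) },
		{ .iov_base = seg1, .iov_len = sizeof(seg1) },
		{ .iov_base = seg2, .iov_len = sizeof(seg2) },
	};
	size_t off = 0, len;
	int i;

	/* Same loop shape as buf_to_iov() above for seek == 0. */
	for (i = 0; i < 3 && off < sizeof(payload); i++) {
		len = MIN(iov[i].iov_len, sizeof(payload) - off);
		memcpy(iov[i].iov_base, payload + off, len);
		off += len;
	}
	/* seg0 = "0123", seg1 = "456789ab", seg2 = "cdef" + NUL. */
	printf("copied %zu bytes across %d segments\n", off, i);
	return (0);
}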
Index: head/usr.sbin/bhyve/net_backends.h
===================================================================
--- head/usr.sbin/bhyve/net_backends.h
+++ head/usr.sbin/bhyve/net_backends.h
@@ -45,6 +45,7 @@
 	    unsigned vnet_hdr_len);
 size_t netbe_get_vnet_hdr_len(net_backend_t *be);
 ssize_t netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
+ssize_t netbe_peek_recvlen(net_backend_t *be);
 ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t netbe_rx_discard(net_backend_t *be);
 void netbe_rx_disable(net_backend_t *be);

Index: head/usr.sbin/bhyve/net_backends.c
===================================================================
--- head/usr.sbin/bhyve/net_backends.c
+++ head/usr.sbin/bhyve/net_backends.c
@@ -103,6 +103,13 @@
 	    int iovcnt);
 
 	/*
+	 * Get the length of the next packet that can be received from
+	 * the backend. If no packets are currently available, this
+	 * function returns 0.
+	 */
+	ssize_t (*peek_recvlen)(struct net_backend *be);
+
+	/*
 	 * Called to receive a packet from the backend. When the function
 	 * returns a positive value 'len', the scatter-gather vector
 	 * provided by the caller contains a packet with such length.
@@ -167,6 +174,13 @@
 
 struct tap_priv {
 	struct mevent *mevp;
+	/*
+	 * A bounce buffer that allows us to implement the peek_recvlen
+	 * callback. In the future we may get the same information from
+	 * the kevent data.
+	 */
+	char bbuf[1 << 16];
+	ssize_t bbuflen;
 };
 
 static void
@@ -223,6 +237,9 @@
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
+	memset(priv->bbuf, 0, sizeof(priv->bbuf));
+	priv->bbuflen = 0;
+
 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
 	if (priv->mevp == NULL) {
 		WPRINTF(("Could not register event"));
@@ -246,15 +263,56 @@
 }
 
 static ssize_t
+tap_peek_recvlen(struct net_backend *be)
+{
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
+	ssize_t ret;
+
+	if (priv->bbuflen > 0) {
+		/*
+		 * We already have a packet in the bounce buffer.
+		 * Just return its length.
+		 */
+		return (priv->bbuflen);
+	}
+
+	/*
+	 * Read the next packet (if any) into the bounce buffer, so
+	 * that we learn its length and can return it to the caller.
+	 */
+	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
+	if (ret < 0 && errno == EWOULDBLOCK) {
+		return (0);
+	}
+
+	if (ret > 0)
+		priv->bbuflen = ret;
+
+	return (ret);
+}
+
+static ssize_t
 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
 	ssize_t ret;
 
-	/* Should never be called without a valid tap fd */
-	assert(be->fd != -1);
+	if (priv->bbuflen > 0) {
+		/*
+		 * A packet is available in the bounce buffer, so
+		 * we read it from there.
+		 */
+		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
+		    iov, iovcnt, 0);
 
-	ret = readv(be->fd, iov, iovcnt);
+		/* Mark the bounce buffer as empty. */
+		priv->bbuflen = 0;
 
+		return (ret);
+	}
+
+	ret = readv(be->fd, iov, iovcnt);
 	if (ret < 0 && errno == EWOULDBLOCK) {
 		return (0);
 	}
@@ -299,6 +357,7 @@
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -313,6 +372,7 @@
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -331,8 +391,7 @@
 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
 	VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
 	VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
-	VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
-	VIRTIO_NET_F_MRG_RXBUF)
+	VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
 
 struct netmap_priv {
 	char ifname[IFNAMSIZ];
@@ -540,6 +599,26 @@
 }
 
 static ssize_t
+netmap_peek_recvlen(struct net_backend *be)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+	struct netmap_ring *ring = priv->rx;
+	uint32_t head = ring->head;
+	ssize_t totlen = 0;
+
+	while (head != ring->tail) {
+		struct netmap_slot *slot = ring->slot + head;
+
+		totlen += slot->len;
+		if ((slot->flags & NS_MOREFRAG) == 0)
+			break;
+		head = nm_ring_next(ring, head);
+	}
+
+	return (totlen);
+}
+
+static ssize_t
 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
@@ -628,6 +707,7 @@
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -642,6 +722,7 @@
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -756,6 +837,13 @@
 {
 
 	return (be->send(be, iov, iovcnt));
+}
+
+ssize_t
+netbe_peek_recvlen(struct net_backend *be)
+{
+
+	return (be->peek_recvlen(be));
 }
 
 /*
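To illustrate how the new interface is meant to be consumed, here is a sketch (not part of the patch) of a hypothetical drain_backend() helper that pairs netbe_peek_recvlen() with netbe_recv(): peek first to learn the next packet's length, then receive it. It assumes a net_backend_t initialized elsewhere with netbe_init() and a caller-supplied iovec array.

#include <sys/types.h>
#include <sys/uio.h>

#include "net_backends.h"

/*
 * Hypothetical helper (illustrative only): receive every packet
 * currently queued in the backend.
 */
static void
drain_backend(net_backend_t *be, struct iovec *iov, int iovcnt)
{
	ssize_t plen, rlen;

	for (;;) {
		/* Length of the next packet; 0 means none pending. */
		plen = netbe_peek_recvlen(be);
		if (plen <= 0)
			break;

		/*
		 * A real caller would ensure that iov provides at
		 * least plen bytes before committing buffers.
		 */
		rlen = netbe_recv(be, iov, iovcnt);
		if (rlen != plen)
			break;	/* backend misbehaving; stop */
	}
}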
Index: head/usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- head/usr.sbin/bhyve/pci_virtio_net.c
+++ head/usr.sbin/bhyve/pci_virtio_net.c
@@ -228,22 +228,34 @@
 	struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
 	struct iovec iov[VTNET_MAXSEGS + 1];
 	struct vqueue_info *vq;
-	uint32_t riov_bytes;
-	struct iovec *riov;
-	int riov_len;
-	uint32_t ulen;
-	int n_chains;
-	int len;
 
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	for (;;) {
 		struct virtio_net_rxhdr *hdr;
+		uint32_t riov_bytes;
+		struct iovec *riov;
+		uint32_t ulen;
+		int riov_len;
+		int n_chains;
+		ssize_t rlen;
+		ssize_t plen;
+
+		plen = netbe_peek_recvlen(sc->vsc_be);
+		if (plen <= 0) {
+			/*
+			 * No more packets (plen == 0), or backend errored
+			 * (plen < 0). Interrupt if needed and stop.
+			 */
+			vq_endchains(vq, /*used_all_avail=*/0);
+			return;
+		}
+		plen += prepend_hdr_len;
 
 		/*
 		 * Get a descriptor chain to store the next ingress
 		 * packet. In case of mergeable rx buffers, get as
 		 * many chains as necessary in order to make room
-		 * for a maximum sized LRO packet.
+		 * for plen bytes.
 		 */
 		riov_bytes = 0;
 		riov_len = 0;
@@ -287,8 +299,7 @@
 			riov_bytes += info[n_chains].len;
 			riov += n;
 			n_chains++;
-		} while (riov_bytes < VTNET_MAX_PKT_LEN &&
-		    riov_len < VTNET_MAXSEGS);
+		} while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
 
 		riov = iov;
 		hdr = riov[0].iov_base;
@@ -312,21 +323,20 @@
 			memset(hdr, 0, prepend_hdr_len);
 		}
 
-		len = netbe_recv(sc->vsc_be, riov, riov_len);
-
-		if (len <= 0) {
+		rlen = netbe_recv(sc->vsc_be, riov, riov_len);
+		if (rlen != plen - prepend_hdr_len) {
 			/*
-			 * No more packets (len == 0), or backend errored
-			 * (err < 0). Return unused available buffers
-			 * and stop.
+			 * If this happens it means there is something
+			 * wrong with the backend (e.g., some other
+			 * process is stealing our packets).
 			 */
+			WPRINTF(("netbe_recv: expected %zd bytes, "
+			    "got %zd", plen - prepend_hdr_len, rlen));
 			vq_retchains(vq, n_chains);
-			/* Interrupt if needed/appropriate and stop. */
-			vq_endchains(vq, /*used_all_avail=*/0);
-			return;
+			continue;
 		}
 
-		ulen = (uint32_t)(len + prepend_hdr_len);
+		ulen = (uint32_t)plen;
 
 		/*
 		 * Publish the used buffers to the guest, reporting the
@@ -346,12 +356,11 @@
 			vq_relchain_prepare(vq, info[i].idx, iolen);
 			ulen -= iolen;
 			i++;
-			assert(i <= n_chains);
 		} while (ulen > 0);
 
 		hdr->vrh_bufs = i;
 		vq_relchain_publish(vq);
-		vq_retchains(vq, n_chains - i);
+		assert(i == n_chains);
 	}
 }
@@ -592,7 +601,8 @@
 		free(sc);
 		return (err);
 	}
-	sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
+	sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
+	    netbe_get_cap(sc->vsc_be);
 	}
 
 	if (!mac_provided) {
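The publish loop in the hunk above splits the used length ulen across the descriptor chains that were reserved for the packet. Here is a standalone sketch of just that accounting, with illustrative chain capacities and printf() standing in for vq_relchain_prepare(); the capacities and packet size are hypothetical.

#include <stdio.h>
#include <stdint.h>

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

int
main(void)
{
	uint32_t caps[] = { 1536, 1536, 1536 };	/* like info[i].len */
	uint32_t ulen = 4000;			/* like plen above */
	uint32_t iolen;
	int i = 0;

	/* Same do/while shape as the publish loop in the patch. */
	do {
		iolen = MIN(caps[i], ulen);
		printf("chain %d: publish %u bytes\n", i, iolen);
		ulen -= iolen;
		i++;
	} while (ulen > 0);
	/* With these numbers: 1536 + 1536 + 928 = 4000, so i == 3. */
	return (0);
}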