Index: usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- usr.sbin/bhyve/pci_virtio_net.c
+++ usr.sbin/bhyve/pci_virtio_net.c
@@ -73,6 +73,8 @@
 
 #define	VTNET_MAXSEGS	256
 
+#define	VTNET_MIN_AVAIL_DESC	64
+
 /*
  * Host capabilities.  Note that we only offer a few of these.
  */
@@ -392,85 +394,107 @@
 }
 
 static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int r, i;
-	int len = 0;
+	char *buf;
+	int i;
+	int frag_size;
+	int iov_off;
+	int len;
+	int nm_off;
+	int nm_buf_size;
 
-	for (r = nmd->cur_tx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
+	struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, nmd->cur_tx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_tx_ring)
-				r = nmd->first_tx_ring;
-			if (r == nmd->cur_tx_ring)
-				break;
-			continue;
+	if ((nm_ring_space(ring) * ring->nr_buf_size) < iovsize) {
+		/*
+		 * No more avail space in the TX ring, try to flush it.
+		 */
+		ioctl(nmd->fd, NIOCTXSYNC, NULL);
+		return (0);
+	}
+
+	i = ring->cur;
+	buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+	iov_off = 0;
+	len = iovsize;
+	nm_buf_size = ring->nr_buf_size;
+	nm_off = 0;
+
+	/* Split the packet across as many slots as needed, chained by NS_MOREFRAG. */
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			/* Current iovec entry exhausted, move to the next one. */
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		for (i = 0; i < iovcnt; i++) {
-			if (len + iov[i].iov_len > 2048)
-				break;
-			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
-			len += iov[i].iov_len;
+		if (unlikely(nm_off == nm_buf_size)) {
+			/* Current netmap slot is full, chain the next one. */
+			ring->slot[i].flags = NS_MOREFRAG;
+			i = nm_ring_next(ring, i);
+			buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->slot[cur].len = len;
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_tx_ring = r;
-		ioctl(nmd->fd, NIOCTXSYNC, NULL);
-		break;
+
+		frag_size = MIN(nm_buf_size - nm_off, iov->iov_len - iov_off);
+		memcpy(buf + nm_off, iov->iov_base + iov_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
+
+		ring->slot[i].len = nm_off;
 	}
+	/* The last slot must not have NS_MOREFRAG set. */
+	ring->slot[i].flags &= ~NS_MOREFRAG;
+	ring->head = ring->cur = nm_ring_next(ring, i);
+	ioctl(nmd->fd, NIOCTXSYNC, NULL);
+
 	return (len);
 }
 
 static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int len = 0;
-	int i = 0;
-	int r;
+	char *buf;
+	int i;
+	int iov_off;
+	int frag_size;
+	int len;
+	int nm_off;
 
-	for (r = nmd->cur_rx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
-		size_t left;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_rx_ring)
-				r = nmd->first_rx_ring;
-			if (r == nmd->cur_rx_ring)
-				break;
-			continue;
+	i = r->cur;
+	buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+	iov_off = 0;
+	nm_off = 0;
+	len = iovsize;
 
+	/* Gather the packet, possibly spanning several NS_MOREFRAG slots. */
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		left = ring->slot[cur].len;
-		for (i = 0; i < iovcnt && left > 0; i++) {
-			if (iov[i].iov_len > left)
-				iov[i].iov_len = left;
-			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
-			len += iov[i].iov_len;
-			left -= iov[i].iov_len;
+		if (unlikely(nm_off == r->slot[i].len)) {
+			i = nm_ring_next(r, i);
+			buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_rx_ring = r;
-		ioctl(nmd->fd, NIOCRXSYNC, NULL);
-		break;
+
+		frag_size = MIN(r->slot[i].len - nm_off, iov->iov_len - iov_off);
+		memcpy(iov->iov_base + iov_off, buf + nm_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
 	}
-	for (; i < iovcnt; i++)
-		iov[i].iov_len = 0;
+
+	r->head = r->cur = nm_ring_next(r, i);
+	ioctl(nmd->fd, NIOCRXSYNC, NULL);
+
+	return (len);
 }
 
@@ -481,32 +505,102 @@
 pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		    int len)
 {
-	static char pad[60]; /* all zero bytes */
-
 	if (sc->vsc_nmd == NULL)
 		return;
 
-	/*
-	 * If the length is < 60, pad out to that and add the
-	 * extra zero'd segment to the iov. It is guaranteed that
-	 * there is always an extra iov available by the caller.
-	 */
-	if (len < 60) {
-		iov[iovcnt].iov_base = pad;
-		iov[iovcnt].iov_len = 60 - len;
-		iovcnt++;
+	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt, len);
+}
+
+/*
+ * Map the guest avail descriptors, starting 'start' entries past
+ * vq_last_avail, into 'iov' until at least 'len' bytes are covered.
+ * The matching used ring entries are filled in, but the used index is
+ * not advanced (see vq_inc_used_idx_and_last_avail()).  Returns the
+ * number of descriptors consumed, or 0 if fewer than 'minavail'
+ * descriptors are available or 'len' bytes cannot be covered.
+ */
+static __inline int
+vq_avail_to_iovec(struct vqueue_info *vq, struct iovec *iov, int len, int start,
+    int minavail)
+{
+	int idx;
+	uint16_t mask = vq->vq_qsize - 1;
+	volatile struct virtio_desc *vdir;
+	struct vmctx *ctx = vq->vq_vs->vs_pi->pi_vmctx;
+
+	uint16_t ndesc = (uint16_t)(vq->vq_avail->va_idx - vq->vq_last_avail - start);
+
+	if (ndesc < minavail)
+		return (0);
+
+	int off = 0;
+	int uidx = vq->vq_used->vu_idx + start;
+
+	for (int i = 0; i < ndesc; i++) {
+		idx = vq->vq_avail->va_ring[(vq->vq_last_avail + i + start) & mask];
+		vdir = &vq->vq_desc[idx];
+
+		iov[i].iov_base = paddr_guest2host(ctx,
+		    vdir->vd_addr, vdir->vd_len);
+		iov[i].iov_len = vdir->vd_len;
+
+		off += vdir->vd_len;
+
+		vq->vq_used->vu_ring[uidx & mask].vu_idx = idx;
+		vq->vq_used->vu_ring[uidx & mask].vu_tlen = (off >= len) ?
+		    vdir->vd_len - (off - len) : vdir->vd_len;
+
+		uidx++;
+
+		if (off >= len) {
+			return (i + 1);
+		}
 	}
-	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
+
+	return (0);
 }
 
+/*
+ * Publish 'n' chains consumed by previous vq_avail_to_iovec() calls,
+ * advancing both the used index and the last seen avail index.
+ */
+static __inline void
+vq_inc_used_idx_and_last_avail(struct vqueue_info *vq, int n)
+{
+	if (n > 0) {
+		vq->vq_last_avail += n;
+		vq->vq_used->vu_idx += n;
+	}
+}
+
+/*
+ * Total length of the next packet in the netmap RX ring, following
+ * NS_MOREFRAG chains; returns 0 if the ring is empty.
+ */
+static __inline int
+netmap_next_pkt_len(struct nm_desc *nmd)
+{
+	int i;
+	int len;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	len = 0;
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		len += r->slot[i].len;
+		if (!(r->slot[i].flags & NS_MOREFRAG))
+			break;
+	}
+
+	return (len);
+}
+
+/* Drop the next packet in the netmap RX ring, including all its fragments. */
+static __inline void
+netmap_drop_pkt(struct nm_desc *nmd)
+{
+	int i;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		if (!(r->slot[i].flags & NS_MOREFRAG)) {
+			r->head = r->cur = nm_ring_next(r, i);
+			return;
+		}
+	}
+}
+
 static void
 pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
 {
-	struct iovec iov[VTNET_MAXSEGS], *riov;
+	struct iovec iov[VTNET_RINGSZ], *riov;
 	struct vqueue_info *vq;
-	void *vrx;
-	int len, n;
-	uint16_t idx;
+	int len;
+	int n;
+	int used;
 
 	/*
 	 * Should never be called without a valid netmap descriptor
@@ -517,11 +611,11 @@
 	 * But, will be called when the rx ring hasn't yet
 	 * been set up or the guest is resetting the device.
 	 */
-	if (!sc->vsc_rx_ready || sc->resetting) {
+	if (unlikely(!sc->vsc_rx_ready || sc->resetting)) {
 		/*
 		 * Drop the packet and try later.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		return;
 	}
 
@@ -529,63 +623,54 @@
 	 * Check for available rx buffers
 	 */
 	vq = &sc->vsc_queues[VTNET_RXQ];
-	if (!vq_has_descs(vq)) {
+	if (unlikely(!vq_has_descs(vq))) {
 		/*
 		 * Drop the packet and try later.  Interrupt on
 		 * empty, if that's negotiated.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		vq_endchains(vq, 1);
 		return;
 	}
 
+	used = 0;
+
 	do {
-		/*
-		 * Get descriptor chain.
-		 */
-		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
-		assert(n >= 1 && n <= VTNET_MAXSEGS);
+		len = netmap_next_pkt_len(sc->vsc_nmd);
 
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vrx = iov[0].iov_base;
-		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+		if (unlikely(len == 0)) {
+			/* No more packets: publish what was used and stop. */
+			vq_inc_used_idx_and_last_avail(vq, used);
+			vq_endchains(vq, 0);
+			return;
+		}
 
-		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
+		n = vq_avail_to_iovec(vq, iov, len + sc->rx_vhdrlen, used,
+		    VTNET_MIN_AVAIL_DESC);
 
-		if (len == 0) {
-			/*
-			 * No more packets, but still some avail ring
-			 * entries.  Interrupt if needed/appropriate.
-			 */
-			vq_retchain(vq);
+		if (unlikely(n == 0)) {
+			/* Not enough avail descriptors for this packet. */
+			vq_inc_used_idx_and_last_avail(vq, used);
 			vq_endchains(vq, 0);
 			return;
 		}
 
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vrx, 0, sc->rx_vhdrlen);
 		if (sc->rx_merge) {
-			struct virtio_net_rxhdr *vrxh;
-
-			vrxh = vrx;
-			vrxh->vrh_bufs = 1;
+			struct virtio_net_rxhdr *vrxh = iov[0].iov_base;
+			memset(vrxh, 0, sc->rx_vhdrlen);
+			vrxh->vrh_bufs = n;
 		}
 
-		/*
-		 * Release this chain and handle more chains.
-		 */
-		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+		(void) pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n, len);
+
+		used += n;
 	} while (vq_has_descs(vq));
 
 	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+	vq_inc_used_idx_and_last_avail(vq, used);
 	vq_endchains(vq, 1);
+
+	return;
 }
 
 static void