Changeset View
Standalone View
usr.sbin/bhyve/pci_virtio_net.c
Show First 20 Lines • Show All 386 Lines • ▼ Show 20 Lines | do { | ||||
vq_relchain(vq, idx, len + sc->rx_vhdrlen); | vq_relchain(vq, idx, len + sc->rx_vhdrlen); | ||||
} while (vq_has_descs(vq)); | } while (vq_has_descs(vq)); | ||||
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ | /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ | ||||
vq_endchains(vq, 1); | vq_endchains(vq, 1); | ||||
} | } | ||||
static __inline int | static __inline int | ||||
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) | pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize) | ||||
{ | { | ||||
int r, i; | |||||
int len = 0; | |||||
for (r = nmd->cur_tx_ring; ; ) { | |||||
struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); | |||||
uint32_t cur, idx; | |||||
char *buf; | char *buf; | ||||
int i; | |||||
int frag_size; | |||||
int iov_off; | |||||
int len; | |||||
int nm_off; | |||||
int nm_buf_size; | |||||
if (nm_ring_empty(ring)) { | struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, nmd->cur_tx_ring); | ||||
vmaffione: nmd->cur_tx_ring
Why did you drop the iterating over all the possible tx rings? | |||||
Not Done Inline ActionsI drop it to simplify the code under the assumption that current setup code void pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) { ... sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); ... } always sets the number of TX/RX rings to one. And there is no way to change it using bhyve(8). In the future, I want to replace the legacy API and add the ability to specify not only the VALE switch, but also other types of netmap ports, and return the iteration through the rings. Or I can return this code now. What do you think? afedorov: I drop it to simplify the code under the assumption that current setup code
```
void… | |||||
Done Inline ActionsThat's fine. But still you need to replace cur_rx_ring with cur_tx_ring, for consistency. vmaffione: That's fine. But still you need to replace cur_rx_ring with cur_tx_ring, for consistency. | |||||
Done Inline ActionsVincenzo, do you have any other suggestion about netmap API usage? afedorov: Vincenzo, do you have any other suggestion about netmap API usage? | |||||
r++; | |||||
if (r > nmd->last_tx_ring) | if ((nm_ring_space(ring) * ring->nr_buf_size) < iovsize) { | ||||
r = nmd->first_tx_ring; | /* | ||||
if (r == nmd->cur_tx_ring) | * No more avail space in TX ring, try to flush it. | ||||
break; | */ | ||||
continue; | ioctl(nmd->fd, NIOCTXSYNC, NULL); | ||||
return (0); | |||||
} | } | ||||
cur = ring->cur; | |||||
idx = ring->slot[cur].buf_idx; | |||||
buf = NETMAP_BUF(ring, idx); | |||||
for (i = 0; i < iovcnt; i++) { | i = ring->cur; | ||||
if (len + iov[i].iov_len > 2048) | buf = NETMAP_BUF(ring, ring->slot[i].buf_idx); | ||||
break; | iov_off = 0; | ||||
memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); | len = iovsize; | ||||
len += iov[i].iov_len; | nm_buf_size = ring->nr_buf_size; | ||||
nm_off = 0; | |||||
while (iovsize) { | |||||
if (unlikely(iov_off == iov->iov_len)) { | |||||
iov++; | |||||
iov_off = 0; | |||||
} | } | ||||
ring->slot[cur].len = len; | |||||
ring->head = ring->cur = nm_ring_next(ring, cur); | if (unlikely(nm_off == nm_buf_size)) { | ||||
nmd->cur_tx_ring = r; | ring->slot[i].flags = NS_MOREFRAG; | ||||
ioctl(nmd->fd, NIOCTXSYNC, NULL); | i = nm_ring_next(ring, i); | ||||
break; | buf = NETMAP_BUF(ring, ring->slot[i].buf_idx); | ||||
nm_off = 0; | |||||
} | } | ||||
frag_size = MIN(nm_buf_size - nm_off, iov->iov_len - iov_off); | |||||
memcpy(buf + nm_off, iov->iov_base + iov_off, frag_size); | |||||
iovsize -= frag_size; | |||||
iov_off += frag_size; | |||||
nm_off += frag_size; | |||||
ring->slot[i].len = nm_off; | |||||
} | |||||
/* The last slot must not have NS_MOREFRAG set. */ | |||||
ring->slot[i].flags &= ~NS_MOREFRAG; | |||||
ring->head = ring->cur = nm_ring_next(ring, i); | |||||
ioctl(nmd->fd, NIOCTXSYNC, NULL); | |||||
return (len); | return (len); | ||||
} | } | ||||
static __inline int | static __inline int | ||||
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) | pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize) | ||||
{ | { | ||||
int len = 0; | |||||
int i = 0; | |||||
int r; | |||||
for (r = nmd->cur_rx_ring; ; ) { | |||||
struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); | |||||
uint32_t cur, idx; | |||||
char *buf; | char *buf; | ||||
size_t left; | int i; | ||||
int iov_off; | |||||
int frag_size; | |||||
int len; | |||||
int nm_off; | |||||
if (nm_ring_empty(ring)) { | struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring); | ||||
r++; | |||||
if (r > nmd->last_rx_ring) | i = r->head; | ||||
r = nmd->first_rx_ring; | buf = NETMAP_BUF(r, r->slot[i].buf_idx); | ||||
if (r == nmd->cur_rx_ring) | iov_off = 0; | ||||
break; | nm_off = 0; | ||||
continue; | len = iovsize; | ||||
while (iovsize) { | |||||
if (unlikely(iov_off == iov->iov_len)) { | |||||
iov++; | |||||
iov_off = 0; | |||||
Done Inline Actionsr->head is better than r->cur, although there is no difference right now (see comment below). vmaffione: r->head is better than r->cur, although there is no difference right now (see comment below). | |||||
} | } | ||||
cur = ring->cur; | |||||
idx = ring->slot[cur].buf_idx; | |||||
buf = NETMAP_BUF(ring, idx); | |||||
left = ring->slot[cur].len; | |||||
for (i = 0; i < iovcnt && left > 0; i++) { | if (unlikely(nm_off == r->slot[i].len)) { | ||||
if (iov[i].iov_len > left) | i = nm_ring_next(r, i); | ||||
iov[i].iov_len = left; | buf = NETMAP_BUF(r, r->slot[i].buf_idx); | ||||
memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); | nm_off = 0; | ||||
len += iov[i].iov_len; | |||||
left -= iov[i].iov_len; | |||||
} | } | ||||
ring->head = ring->cur = nm_ring_next(ring, cur); | |||||
nmd->cur_rx_ring = r; | frag_size = MIN(r->slot[i].len - nm_off, iov->iov_len - iov_off); | ||||
ioctl(nmd->fd, NIOCRXSYNC, NULL); | memcpy(iov->iov_base + iov_off, buf + nm_off, frag_size); | ||||
break; | |||||
iovsize -= frag_size; | |||||
iov_off += frag_size; | |||||
nm_off += frag_size; | |||||
} | } | ||||
for (; i < iovcnt; i++) | |||||
iov[i].iov_len = 0; | |||||
r->head = r->cur = nm_ring_next(r, i); | |||||
Done Inline ActionsIn theory this NIOCRXSYNC is not needed, because poll() or kqueue_wait() calls NIOCRXSYNC internally. This works perfectly with poll(), at least. As far as I know bhyve uses kqueue to wait on the netmap file descriptor. What happens if you remove this ioctl()? vmaffione: In theory this NIOCRXSYNC is not needed, because poll() or kqueue_wait() calls NIOCRXSYNC… | |||||
Done Inline ActionsI agree, it seems there are no need to call ioctl(..., NIOCRXSYNC, ...). And I observed a some throughput increase (~300-500 MBit/s 1500 MTU), when RX packet occupied one netmap buffer. This is explained by the fact that additional syscall was gone. But I need some time to investigate how this affects performance with a large MTU. afedorov: I agree, it seems there are no need to call ioctl(..., NIOCRXSYNC, ...). And I observed a some… | |||||
return (len); | return (len); | ||||
} | } | ||||
/* | /* | ||||
* Called to send a buffer chain out to the vale port | * Called to send a buffer chain out to the vale port | ||||
*/ | */ | ||||
static void | static void | ||||
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, | pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, | ||||
int len) | int len) | ||||
{ | { | ||||
static char pad[60]; /* all zero bytes */ | |||||
if (sc->vsc_nmd == NULL) | if (sc->vsc_nmd == NULL) | ||||
return; | return; | ||||
/* | (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt, len); | ||||
* If the length is < 60, pad out to that and add the | |||||
* extra zero'd segment to the iov. It is guaranteed that | |||||
* there is always an extra iov available by the caller. | |||||
*/ | |||||
if (len < 60) { | |||||
iov[iovcnt].iov_base = pad; | |||||
iov[iovcnt].iov_len = 60 - len; | |||||
iovcnt++; | |||||
} | } | ||||
(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); | |||||
static __inline int | |||||
netmap_next_pkt_len(struct nm_desc *nmd) | |||||
{ | |||||
int i; | |||||
int len; | |||||
struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring); | |||||
len = 0; | |||||
for (i = r->head; i != r->tail; i = nm_ring_next(r, i)) { | |||||
len += r->slot[i].len; | |||||
if (!(r->slot[i].flags & NS_MOREFRAG)) | |||||
Not Done Inline Actionswhy this check? if there are not enough descriptors to cover 'len' bytes the function will return 0 anyway. vmaffione: why this check? if there are not enough descriptors to cover 'len' bytes the function will… | |||||
break; | |||||
Not Done Inline ActionsYou are mixing declarations and code here. Is it allowed? vmaffione: You are mixing declarations and code here. Is it allowed? | |||||
} | } | ||||
return (len); | |||||
} | |||||
static __inline void | |||||
netmap_drop_pkt(struct nm_desc *nmd) | |||||
{ | |||||
int i; | |||||
struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring); | |||||
for (i = r->head; i != r->tail; i = nm_ring_next(r, i)) { | |||||
if (!(r->slot[i].flags & NS_MOREFRAG)) { | |||||
r->head = r->cur = nm_ring_next(r, i); | |||||
return; | |||||
} | |||||
} | |||||
} | |||||
static void | static void | ||||
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) | pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) | ||||
{ | { | ||||
struct iovec iov[VTNET_MAXSEGS], *riov; | struct iovec iov[VTNET_MAXSEGS], *riov; | ||||
struct virtio_used used[VTNET_MAXSEGS]; | |||||
Done Inline ActionsYou should use r->head to get the next to use. r->cur is for synchronization, and points at the next unseen, which may be ahead of r->head, although not your code. vmaffione: You should use r->head to get the next to use. r->cur is for synchronization, and points at the… | |||||
vmaffioneUnsubmitted Not Done Inline ActionsWhy are you using this temporary array instead of writing directly to the used ring? vmaffione: Why are you using this temporary array instead of writing directly to the used ring?
Used… | |||||
struct virtio_net_rxhdr *vrxh; | |||||
struct vqueue_info *vq; | struct vqueue_info *vq; | ||||
void *vrx; | |||||
int len, n; | |||||
uint16_t idx; | uint16_t idx; | ||||
int bufs, len, n; | |||||
/* | /* | ||||
* Should never be called without a valid netmap descriptor | * Should never be called without a valid netmap descriptor | ||||
*/ | */ | ||||
assert(sc->vsc_nmd != NULL); | assert(sc->vsc_nmd != NULL); | ||||
/* | /* | ||||
* But, will be called when the rx ring hasn't yet | * But, will be called when the rx ring hasn't yet | ||||
* been set up or the guest is resetting the device. | * been set up or the guest is resetting the device. | ||||
*/ | */ | ||||
if (!sc->vsc_rx_ready || sc->resetting) { | if (!sc->vsc_rx_ready || sc->resetting) { | ||||
Done Inline Actionsr->head rather than r->cur vmaffione: r->head rather than r->cur | |||||
/* | /* | ||||
* Drop the packet and try later. | * Drop the packet and try later. | ||||
*/ | */ | ||||
(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); | netmap_drop_pkt(sc->vsc_nmd); | ||||
return; | return; | ||||
} | } | ||||
/* | /* | ||||
* Check for available rx buffers | * Check for available rx buffers | ||||
*/ | */ | ||||
vq = &sc->vsc_queues[VTNET_RXQ]; | vq = &sc->vsc_queues[VTNET_RXQ]; | ||||
if (!vq_has_descs(vq)) { | if (!vq_has_descs(vq)) { | ||||
/* | /* | ||||
* Drop the packet and try later. Interrupt on | * Drop the packet and try later. Interrupt on | ||||
* empty, if that's negotiated. | * empty, if that's negotiated. | ||||
*/ | */ | ||||
(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); | netmap_drop_pkt(sc->vsc_nmd); | ||||
vq_endchains(vq, 1); | vq_endchains(vq, 1); | ||||
return; | return; | ||||
} | } | ||||
do { | do { | ||||
len = netmap_next_pkt_len(sc->vsc_nmd); | |||||
if (unlikely(len == 0)) { | |||||
/* | /* | ||||
* Get descriptor chain. | * No more packets, but still some avail ring | ||||
* entries. Interrupt if needed/appropriate. | |||||
*/ | */ | ||||
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); | vq_endchains(vq, 0); | ||||
assert(n >= 1 && n <= VTNET_MAXSEGS); | return; | ||||
} | |||||
if (sc->rx_merge) { | |||||
/* | /* | ||||
* Get a pointer to the rx header, and use the | * Get mergable buffers. | ||||
* data immediately following it for the packet buffer. | |||||
*/ | */ | ||||
vrx = iov[0].iov_base; | n = vq_getbufs_mrgrx(vq, iov, VTNET_MAXSEGS, len + sc->rx_vhdrlen, | ||||
riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); | used, &bufs); | ||||
} else { | |||||
len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); | |||||
if (len == 0) { | |||||
/* | /* | ||||
* No more packets, but still some avail ring | * Get descriptor chain. | ||||
* entries. Interrupt if needed/appropriate. | |||||
*/ | */ | ||||
vq_retchain(vq); | n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); | ||||
} | |||||
if (n <= 0) { | |||||
vq_endchains(vq, 0); | vq_endchains(vq, 0); | ||||
return; | return; | ||||
} | } | ||||
/* | /* | ||||
* The only valid field in the rx packet header is the | * Get a pointer to the rx header, and use the | ||||
* number of buffers if merged rx bufs were negotiated. | * data immediately following it for the packet buffer. | ||||
*/ | */ | ||||
memset(vrx, 0, sc->rx_vhdrlen); | vrxh = iov[0].iov_base; | ||||
memset(vrxh, 0, sc->rx_vhdrlen); | |||||
if (sc->rx_merge) { | riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); | ||||
struct virtio_net_rxhdr *vrxh; | |||||
vrxh = vrx; | (void)pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n, len); | ||||
vrxh->vrh_bufs = 1; | |||||
} | |||||
/* | /* | ||||
* Release this chain and handle more chains. | * Release used descriptors. | ||||
*/ | */ | ||||
if (sc->rx_merge) { | |||||
vrxh->vrh_bufs = bufs; | |||||
vq_relbufs_mrgrx(vq, bufs, used); | |||||
} else { | |||||
vq_relchain(vq, idx, len + sc->rx_vhdrlen); | vq_relchain(vq, idx, len + sc->rx_vhdrlen); | ||||
} | |||||
} while (vq_has_descs(vq)); | } while (vq_has_descs(vq)); | ||||
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ | /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ | ||||
Not Done Inline ActionsI think you should call vq_inc_used_idx_and_last_avail() after each packet, otherwise you are introducing artificial latency. As a side effect, this allows you to remove the "start" argument ("used" local variable) and the used local variable. vmaffione: I think you should call vq_inc_used_idx_and_last_avail() after each packet, otherwise you are… | |||||
vq_endchains(vq, 1); | vq_endchains(vq, 1); | ||||
} | } | ||||
Not Done Inline Actionswhy this return? vmaffione: why this return? | |||||
static void | static void | ||||
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) | pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) | ||||
{ | { | ||||
struct pci_vtnet_softc *sc = param; | struct pci_vtnet_softc *sc = param; | ||||
pthread_mutex_lock(&sc->rx_mtx); | pthread_mutex_lock(&sc->rx_mtx); | ||||
sc->rx_in_progress = 1; | sc->rx_in_progress = 1; | ||||
▲ Show 20 Lines • Show All 398 Lines • Show Last 20 Lines |
nmd->cur_tx_ring
Why did you drop the iterating over all the possible tx rings?