diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -403,15 +403,20 @@ * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * - * We should add a reference to the mbuf so the m_freem() at the end - * of the transmission does not consume resources. + * Zero-copy transmission is possible if netmap is attached directly to a + * hardware interface: when cleaning we simply wait for the mbuf cluster + * refcount to decrement to 1, indicating that the driver has completed + * transmission and is done with the buffer. However, this approach can + * lead to queue deadlocks when attaching to software interfaces (e.g., + * if_bridge) since we cannot rely on member ports to promptly reclaim + * transmitted mbufs. Since there is no easy way to distinguish these + * cases, we currently always copy the buffer. * - * On FreeBSD, and on multiqueue cards, we can force the queue using + * On multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; - * */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) @@ -421,16 +426,18 @@ if_t ifp = a->ifp; struct mbuf *m = a->m; - /* Link the external storage to - * the netmap buffer, so that no copy is necessary. */ - m->m_ext.ext_buf = m->m_data = a->addr; - m->m_ext.ext_size = len; + M_ASSERTPKTHDR(m); + if (MBUF_REFCNT(m) != 1) { + nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m); + panic("in generic_xmit_frame"); + } + if (unlikely(m->m_ext.ext_size < len)) { + nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len); + len = m->m_ext.ext_size; + } - m->m_flags |= M_PKTHDR; + m_copyback(m, 0, len, a->addr); m->m_len = m->m_pkthdr.len = len; - - /* mbuf refcnt is not contended, no need to use atomic - * (a memory barrier is enough). */ SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; @@ -441,7 +448,6 @@ return ret ? -1 : 0; } - struct netmap_adapter * netmap_getna(if_t ifp) { diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -272,6 +272,7 @@ } for_each_tx_kring(r, kring, na) { + callout_drain(&kring->tx_event_callout); mtx_destroy(&kring->tx_event_lock); if (kring->tx_pool == NULL) { continue; @@ -357,6 +358,9 @@ } mtx_init(&kring->tx_event_lock, "tx_event_lock", NULL, MTX_SPIN); + callout_init_mtx(&kring->tx_event_callout, + &kring->tx_event_lock, + CALLOUT_RETURNUNLOCKED); } } @@ -473,7 +477,13 @@ if (++r == na->num_tx_rings) r = 0; if (r == r_orig) { +#ifndef __FreeBSD__ + /* + * On FreeBSD this situation can arise if the tx_event + * callout handler cleared a stuck packet. + */ nm_prlim(1, "Cannot match event %p", m); +#endif return; } } @@ -481,9 +491,7 @@ /* Second, wake up clients. They will reclaim the event through * txsync. */ netmap_generic_irq(na, r, NULL); -#ifdef __FreeBSD__ void_mbuf_dtor(m); -#endif } /* Record completed transmissions and update hwtail. @@ -537,7 +545,6 @@ } /* The event has been consumed, we can go * ahead. */ - } else if (MBUF_REFCNT(m) != 1) { /* This mbuf is still busy: its refcnt is 2. */ break; @@ -577,6 +584,18 @@ return e; } +#ifdef __FreeBSD__ +static void +generic_tx_callout(void *arg) +{ + struct netmap_kring *kring = arg; + + kring->tx_event = NULL; + mtx_unlock_spin(&kring->tx_event_lock); + netmap_generic_irq(kring->na, kring->ring_id, NULL); +} +#endif + static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { @@ -622,6 +641,22 @@ SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); kring->tx_event = m; +#ifdef __FreeBSD__ + /* + * Handle the possibility that the transmitted buffer isn't reclaimed + * within a bounded period of time. This can arise when transmitting + * out of multiple ports via a lagg or bridge interface, since the + * member ports may legitimately only free transmitted buffers in + * batches. + * + * The callout handler clears the stuck packet from the ring, allowing + * transmission to proceed. In the common case we let + * generic_mbuf_destructor() unstick the ring, allowing mbufs to be + * reused most of the time. + */ + callout_reset_sbt_curcpu(&kring->tx_event_callout, SBT_1MS, 0, + generic_tx_callout, kring, 0); +#endif mtx_unlock_spin(&kring->tx_event_lock); kring->tx_pool[e] = NULL; @@ -631,10 +666,8 @@ /* Decrement the refcount. This will free it if we lose the race * with the driver. */ m_freem(m); - smp_mb(); } - /* * generic_netmap_txsync() transforms netmap buffers into mbufs * and passes them to the standard device driver @@ -783,9 +816,6 @@ #endif } - /* - * Second, reclaim completed buffers - */ if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) { /* No more available slots? Set a notification event * on a netmap slot that will be cleaned in the future. @@ -795,6 +825,9 @@ generic_set_tx_event(kring, nm_i); } + /* + * Second, reclaim completed buffers + */ generic_netmap_tx_clean(kring, gna->txqdisc); return 0; diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -503,6 +503,9 @@ struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ +#ifdef __FreeBSD__ + struct callout tx_event_callout; +#endif struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ @@ -2394,7 +2397,11 @@ * has a KASSERT(), checking that the mbuf dtor function is not NULL. */ -static void void_mbuf_dtor(struct mbuf *m) { } +static inline void +void_mbuf_dtor(struct mbuf *m) +{ + uma_zfree(zone_clust, m->m_ext.ext_buf); +} #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (fn != NULL) ? \ @@ -2402,22 +2409,23 @@ } while (0) static inline struct mbuf * -nm_os_get_mbuf(if_t ifp, int len) +nm_os_get_mbuf(if_t ifp __unused, int len) { struct mbuf *m; + void *buf; - (void)ifp; - (void)len; + KASSERT(len <= MCLBYTES, ("%s: len %d", __func__, len)); m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) { - return m; + if (__predict_false(m == NULL)) + return (NULL); + buf = uma_zalloc(zone_clust, M_NOWAIT); + if (__predict_false(buf == NULL)) { + m_free(m); + return (NULL); } - - m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, - NULL, NULL, 0, EXT_NET_DRV); - - return m; + m_extadd(m, buf, MCLBYTES, void_mbuf_dtor, NULL, NULL, 0, EXT_NET_DRV); + return (m); } #endif /* __FreeBSD__ */