diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -403,15 +403,20 @@ * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * - * We should add a reference to the mbuf so the m_freem() at the end - * of the transmission does not consume resources. + * Zero-copy transmission is possible if netmap is attached directly to a + * hardware interface: when cleaning we simply wait for the mbuf cluster + * refcount to decrement to 1, indicating that the driver has completed + * transmission and is done with the buffer. However, this approach can + * lead to queue deadlocks when attaching to software interfaces (e.g., + * if_bridge) since we cannot rely on member ports to promptly reclaim + * transmitted mbufs. Since there is no easy way to distinguish these + * cases, we currently always copy the buffer. * - * On FreeBSD, and on multiqueue cards, we can force the queue using + * On multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; - * */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) @@ -421,16 +426,21 @@ if_t ifp = a->ifp; struct mbuf *m = a->m; - /* Link the external storage to - * the netmap buffer, so that no copy is necessary. */ - m->m_ext.ext_buf = m->m_data = a->addr; - m->m_ext.ext_size = len; + M_ASSERTPKTHDR(m); + KASSERT((m->m_flags & M_EXT) != 0, + ("%s: mbuf %p has no cluster", __func__, m)); - m->m_flags |= M_PKTHDR; - m->m_len = m->m_pkthdr.len = len; + if (MBUF_REFCNT(m) != 1) { + nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m); + panic("in generic_xmit_frame"); + } + if (unlikely(m->m_ext.ext_size < len)) { + nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len); + len = m->m_ext.ext_size; + } - /* mbuf refcnt is not contended, no need to use atomic - * (a memory barrier is enough). */ + m_copyback(m, 0, len, a->addr); + m->m_len = m->m_pkthdr.len = len; SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; @@ -441,7 +451,6 @@ return ret ? -1 : 0; } - struct netmap_adapter * netmap_getna(if_t ifp) { diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -272,6 +272,7 @@ } for_each_tx_kring(r, kring, na) { + callout_drain(&kring->tx_event_callout); mtx_destroy(&kring->tx_event_lock); if (kring->tx_pool == NULL) { continue; @@ -357,6 +358,9 @@ } mtx_init(&kring->tx_event_lock, "tx_event_lock", NULL, MTX_SPIN); + callout_init_mtx(&kring->tx_event_callout, + &kring->tx_event_lock, + CALLOUT_RETURNUNLOCKED); } } @@ -473,7 +477,13 @@ if (++r == na->num_tx_rings) r = 0; if (r == r_orig) { +#ifndef __FreeBSD__ + /* + * On FreeBSD this situation can arise if the tx_event + * callout handler cleared a stuck packet. + */ nm_prlim(1, "Cannot match event %p", m); +#endif return; } } @@ -481,9 +491,7 @@ /* Second, wake up clients. They will reclaim the event through * txsync. */ netmap_generic_irq(na, r, NULL); -#ifdef __FreeBSD__ void_mbuf_dtor(m); -#endif } /* Record completed transmissions and update hwtail. @@ -537,7 +545,6 @@ } /* The event has been consumed, we can go * ahead. */ - } else if (MBUF_REFCNT(m) != 1) { /* This mbuf is still busy: its refcnt is 2. */ break; @@ -577,6 +584,18 @@ return e; } +#ifdef __FreeBSD__ +static void +generic_tx_callout(void *arg) +{ + struct netmap_kring *kring = arg; + + kring->tx_event = NULL; + mtx_unlock_spin(&kring->tx_event_lock); + netmap_generic_irq(kring->na, kring->ring_id, NULL); +} +#endif + static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { @@ -622,6 +641,22 @@ SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); kring->tx_event = m; +#ifdef __FreeBSD__ + /* + * Handle the possibility that the transmitted buffer isn't reclaimed + * within a bounded period of time. This can arise when transmitting + * out of multiple ports via a lagg or bridge interface, since the + * member ports may legitimately only free transmitted buffers in + * batches. + * + * The callout handler clears the stuck packet from the ring, allowing + * transmission to proceed. In the common case we let + * generic_mbuf_destructor() unstick the ring, allowing mbufs to be + * reused most of the time. + */ + callout_reset_sbt_curcpu(&kring->tx_event_callout, SBT_1MS, 0, + generic_tx_callout, kring, 0); +#endif mtx_unlock_spin(&kring->tx_event_lock); kring->tx_pool[e] = NULL; @@ -631,10 +666,8 @@ /* Decrement the refcount. This will free it if we lose the race * with the driver. */ m_freem(m); - smp_mb(); } - /* * generic_netmap_txsync() transforms netmap buffers into mbufs * and passes them to the standard device driver @@ -712,6 +745,8 @@ break; } IFRATE(rate_ctx.new.txrepl++); + } else { + nm_os_mbuf_reinit(m); } a.m = m; @@ -783,9 +818,6 @@ #endif } - /* - * Second, reclaim completed buffers - */ if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) { /* No more available slots? Set a notification event * on a netmap slot that will be cleaned in the future. @@ -795,6 +827,9 @@ generic_set_tx_event(kring, nm_i); } + /* + * Second, reclaim completed buffers + */ generic_netmap_tx_clean(kring, gna->txqdisc); return 0; diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -501,6 +501,9 @@ struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ +#ifdef __FreeBSD__ + struct callout tx_event_callout; +#endif struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ @@ -2384,16 +2387,17 @@ * * We allocate mbufs with m_gethdr(), since the mbuf header is needed * by the driver. We also attach a customly-provided external storage, - * which in this case is a netmap buffer. When calling m_extadd(), however - * we pass a NULL address, since the real address (and length) will be - * filled in by nm_os_generic_xmit_frame() right before calling - * if_transmit(). + * which in this case is a netmap buffer. * * The dtor function does nothing, however we need it since mb_free_ext() * has a KASSERT(), checking that the mbuf dtor function is not NULL. */ -static void void_mbuf_dtor(struct mbuf *m) { } +static inline void +void_mbuf_dtor(struct mbuf *m) +{ + uma_zfree(zone_clust, m->m_ext.ext_buf); +} #define SET_MBUF_DESTRUCTOR(m, fn) do { \ (m)->m_ext.ext_free = (fn != NULL) ? \ @@ -2401,22 +2405,39 @@ } while (0) static inline struct mbuf * -nm_os_get_mbuf(if_t ifp, int len) +nm_os_get_mbuf(if_t ifp __unused, int len) { struct mbuf *m; + void *buf; - (void)ifp; - (void)len; + KASSERT(len <= MCLBYTES, ("%s: len %d", __func__, len)); m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) { - return m; + if (__predict_false(m == NULL)) + return (NULL); + buf = uma_zalloc(zone_clust, M_NOWAIT); + if (__predict_false(buf == NULL)) { + m_free(m); + return (NULL); } + m_extadd(m, buf, MCLBYTES, void_mbuf_dtor, NULL, NULL, 0, EXT_NET_DRV); + return (m); +} + +static inline void +nm_os_mbuf_reinit(struct mbuf *m) +{ + void *buf; - m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor, - NULL, NULL, 0, EXT_NET_DRV); + KASSERT((m->m_flags & M_EXT) != 0, + ("%s: mbuf %p has no external storage", __func__, m)); + KASSERT(m->m_ext.ext_size == MCLBYTES, + ("%s: mbuf %p has wrong external storage size %u", __func__, m, + m->m_ext.ext_size)); - return m; + buf = m->m_ext.ext_buf; + m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); + m_extadd(m, buf, MCLBYTES, void_mbuf_dtor, NULL, NULL, 0, EXT_NET_DRV); } #endif /* __FreeBSD__ */