Page MenuHomeFreeBSD

bpf: don't recursively enter the network epoch in bpf taps
Needs ReviewPublic

Authored by glebius on Fri, Nov 21, 10:56 PM.
Tags
None
Referenced Files
Unknown Object (File)
Fri, Dec 12, 1:00 PM
Unknown Object (File)
Sat, Nov 29, 7:09 PM
Unknown Object (File)
Sat, Nov 29, 7:02 AM
Unknown Object (File)
Sat, Nov 29, 6:40 AM
Unknown Object (File)
Sat, Nov 29, 12:08 AM
Unknown Object (File)
Fri, Nov 28, 3:07 PM
Unknown Object (File)
Thu, Nov 27, 9:51 AM
Unknown Object (File)
Wed, Nov 26, 9:36 PM

Details

Reviewers
None
Group Reviewers
network

Diff Detail

Repository
rG FreeBSD src repository
Lint
Lint Skipped
Unit
Tests Skipped
Build Status
Buildable 69032
Build 65915: arc lint + arc unit

Event Timeline

The NET_EPOCH_ENTER() in sys/dev/iicbus/if_ic.c should be adjusted,

                if_inc_counter(sc->ic_ifp, IFCOUNTER_IBYTES, len);
+++         NET_EPOCH_ENTER(et);
                BPF_TAP(sc->ic_ifp, sc->ic_ifbuf, len + ICHDRLEN);
                top = m_devget(sc->ic_ifbuf + ICHDRLEN, len, 0, sc->ic_ifp, 0);
                if (top) {
                        struct epoch_tracker et;

                        mtx_unlock(&sc->ic_lock);
                        M_SETFIB(top, if_getfib(sc->ic_ifp));
---                    NET_EPOCH_ENTER(et);
                        netisr_dispatch(NETISR_IP, top);
---                    NET_EPOCH_EXIT(et);
                        mtx_lock(&sc->ic_lock);
                }
+++         NET_EPOCH_EXIT(et);
                break;

and sys/netgraph/ng_iface.c,

        /* Note receiving interface */
        m->m_pkthdr.rcvif = ifp;

+++        NET_EPOCH_ENTER(et);
        /* Berkeley packet filter */
        ng_iface_bpftap(ifp, m, iffam->family);

        /* Send packet */
        switch (iffam->family) {
#ifdef INET
        case AF_INET:
                isr = NETISR_IP;
                break;
#endif
#ifdef INET6
        case AF_INET6:
                isr = NETISR_IPV6;
                break;
#endif
        default:
                m_freem(m);
+++        NET_EPOCH_EXIT(et);
                return (EAFNOSUPPORT);
        }
        random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG);
        M_SETFIB(m, ifp->if_fib);
        CURVNET_SET(ifp->if_vnet);
---        NET_EPOCH_ENTER(et);
        netisr_dispatch(isr, m);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

and sys/net80211/ieee80211_radiotap.c, needs NET_EPOCH_ENTER / NET_EPOCH_EXIT around bpf_mtap2(), as it appears they are called from thread context without net epoch.

The sys/dev/ena/ena.c is suspicious, ena_deferred_mq_start() calls ena_start_xmit() and the latter now requires net epoch.

--- TASK_INIT(&tx_ring->enqueue_task, 0, ena_deferred_mq_start, tx_ring);
+++ NET_TASK_INIT(&tx_ring->enqueue_task, 0, ena_deferred_mq_start, tx_ring);

In sys/dev/gve/gve_tx.c, the deferred task gve_xmit_tq() calls gve_xmit_br() now requires net epoch.

static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
        struct gve_tx_ring *tx = &priv->tx[i];
        struct gve_ring_com *com = &tx->com;

        atomic_store_bool(&tx->stopped, false);
        if (gve_is_gqi(priv))
                NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
        else
                NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
        com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
            taskqueue_thread_enqueue, &com->cleanup_tq);
        taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
            device_get_nameunit(priv->dev), i);

---     TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
+++     NET_TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
        tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
            M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
        taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
            device_get_nameunit(priv->dev), i);
}

It appears that sys/dev/xilinx/axidma.c lacks proper net epoch flag. Both xae_intr_tx() and xae_intr_rx() requires net epoch.

static int
axidma_setup_cb(device_t dev, int chan_id, void (*cb)(void *), void *arg)
{
        struct axidma_softc *sc;
        int error;

        sc = device_get_softc(dev);

        if (sc->ih[chan_id] != NULL)
                return (EEXIST);

        error = bus_setup_intr(dev, sc->res[chan_id + 1],
---         INTR_TYPE_MISC | INTR_MPSAFE, NULL, cb, arg,
+++            INTR_TYPE_NET | INTR_MPSAFE, NULL, cb, arg,
            &sc->ih[chan_id]);
        if (error)
                device_printf(dev, "Unable to alloc interrupt resource.\n");

        return (error);
}

In file sys/dev/sis/if_sis.c, the sis_startl() requires net epoch, but sis_tick() / sis_watchdog() is invoked in callout. The callout is not net epoch aware, is it ?

static void
sis_watchdog(struct sis_softc *sc)
{
+++        struct epoch_tracker et;
        SIS_LOCK_ASSERT(sc);

        if (sc->sis_watchdog_timer == 0 || --sc->sis_watchdog_timer >0)
                return;

        device_printf(sc->sis_dev, "watchdog timeout\n");
        if_inc_counter(sc->sis_ifp, IFCOUNTER_OERRORS, 1);

        if_setdrvflagbits(sc->sis_ifp, 0, IFF_DRV_RUNNING);
        sis_initl(sc);

---        if (!if_sendq_empty(sc->sis_ifp))
+++        if (!if_sendq_empty(sc->sis_ifp)) {
+++        NET_EPOCH_ENTER(et);
               sis_startl(sc->sis_ifp);
+++        NET_EPOCH_EXIT(et);
+++         }
}

To be continued ...

@zlei thanks for all these findings! It seems that at least few of them were violating epoch even before the suggested bpf change. Are you going to commit the fixes? I can help. To me some of these fixes do not look related to bpf at all.

@zlei thanks for all these findings! It seems that at least few of them were violating epoch even before the suggested bpf change. Are you going to commit the fixes? I can help. To me some of these fixes do not look related to bpf at all.

Indeed I have WIP to remove NET_EPOCH_ENTER / NET_EPOCH_EXIT from bpf.c and that is almost identical to your work. Well I must admit that I never finished it as there' re too many places to check manual.

$ grep -Er 'BPF_[M]?TAP|bpf_[m]?tap' sys
.... lots of drivers

I'll try to find them all and send you a patch. Well, this change should also be noted in UPDATING, 3rd party drivers should also be aware this.

Indeed I have WIP to remove NET_EPOCH_ENTER / NET_EPOCH_EXIT from bpf.c and that is almost identical to your work. Well I must admit that I never finished it as there' re too many places to check manual.

$ grep -Er 'BPF_[M]?TAP|bpf_[m]?tap' sys
.... lots of drivers

I'll try to find them all and send you a patch.

Let's just work on them one by one. Most of them already are in the epoch. Those that are not, are all different to each other. Should be committed separately.

Well, this change should also be noted in UPDATING, 3rd party drivers should also be aware this.

I'm not sure about that. The bpf is a network stack thing and it usually is at low level, it is supposed to be entered with epoch. The fact if entered it recursively, I guess, was just a quick fix from the painful era of transitioning to the epoch.

However, I have plans for newer more advanced KPI for bpf_attach. If that happens, then of course a documentation entry will happen.