if_inc_counter(sc->ic_ifp, IFCOUNTER_IBYTES, len);
+++         NET_EPOCH_ENTER(et);
                BPF_TAP(sc->ic_ifp, sc->ic_ifbuf, len + ICHDRLEN);
                top = m_devget(sc->ic_ifbuf + ICHDRLEN, len, 0, sc->ic_ifp, 0);
                if (top) {
                        struct epoch_tracker et;

                        mtx_unlock(&sc->ic_lock);
                        M_SETFIB(top, if_getfib(sc->ic_ifp));
---                    NET_EPOCH_ENTER(et);
                        netisr_dispatch(NETISR_IP, top);
---                    NET_EPOCH_EXIT(et);
                        mtx_lock(&sc->ic_lock);
                }
+++         NET_EPOCH_EXIT(et);
                break;

and sys/netgraph/ng_iface.c,

        /* Note receiving interface */
        m->m_pkthdr.rcvif = ifp;

+++        NET_EPOCH_ENTER(et);
        /* Berkeley packet filter */
        ng_iface_bpftap(ifp, m, iffam->family);

        /* Send packet */
        switch (iffam->family) {
#ifdef INET
        case AF_INET:
                isr = NETISR_IP;
                break;
#endif
#ifdef INET6
        case AF_INET6:
                isr = NETISR_IPV6;
                break;
#endif
        default:
                m_freem(m);
+++        NET_EPOCH_EXIT(et);
                return (EAFNOSUPPORT);
        }
        random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG);
        M_SETFIB(m, ifp->if_fib);
        CURVNET_SET(ifp->if_vnet);
---        NET_EPOCH_ENTER(et);
        netisr_dispatch(isr, m);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

and sys/net80211/ieee80211_radiotap.c, needs NET_EPOCH_ENTER / NET_EPOCH_EXIT around bpf_mtap2(), as it appears they are called from thread context without net epoch.

The sys/dev/ena/ena.c is suspicious, ena_deferred_mq_start() calls ena_start_xmit() and the latter now requires net epoch.

--- TASK_INIT(&tx_ring->enqueue_task, 0, ena_deferred_mq_start, tx_ring);
+++ NET_TASK_INIT(&tx_ring->enqueue_task, 0, ena_deferred_mq_start, tx_ring);

In sys/dev/gve/gve_tx.c, the deferred task gve_xmit_tq() calls gve_xmit_br() now requires net epoch.

static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
        struct gve_tx_ring *tx = &priv->tx[i];
        struct gve_ring_com *com = &tx->com;

        atomic_store_bool(&tx->stopped, false);
        if (gve_is_gqi(priv))
                NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
        else
                NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
        com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
            taskqueue_thread_enqueue, &com->cleanup_tq);
        taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
            device_get_nameunit(priv->dev), i);

---     TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
+++     NET_TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
        tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
            M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
        taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
            device_get_nameunit(priv->dev), i);
}

It appears that sys/dev/xilinx/axidma.c lacks proper net epoch flag. Both xae_intr_tx() and xae_intr_rx() requires net epoch.

static int
axidma_setup_cb(device_t dev, int chan_id, void (*cb)(void *), void *arg)
{
        struct axidma_softc *sc;
        int error;

        sc = device_get_softc(dev);

        if (sc->ih[chan_id] != NULL)
                return (EEXIST);

        error = bus_setup_intr(dev, sc->res[chan_id + 1],
---         INTR_TYPE_MISC | INTR_MPSAFE, NULL, cb, arg,
+++            INTR_TYPE_NET | INTR_MPSAFE, NULL, cb, arg,
            &sc->ih[chan_id]);
        if (error)
                device_printf(dev, "Unable to alloc interrupt resource.\n");

        return (error);
}

In file sys/dev/sis/if_sis.c, the sis_startl() requires net epoch, but sis_tick() / sis_watchdog() is invoked in callout. The callout is not net epoch aware, is it ?

static void
sis_watchdog(struct sis_softc *sc)
{
+++        struct epoch_tracker et;
        SIS_LOCK_ASSERT(sc);

        if (sc->sis_watchdog_timer == 0 || --sc->sis_watchdog_timer >0)
                return;

        device_printf(sc->sis_dev, "watchdog timeout\n");
        if_inc_counter(sc->sis_ifp, IFCOUNTER_OERRORS, 1);

        if_setdrvflagbits(sc->sis_ifp, 0, IFF_DRV_RUNNING);
        sis_initl(sc);

---        if (!if_sendq_empty(sc->sis_ifp))
+++        if (!if_sendq_empty(sc->sis_ifp)) {
+++        NET_EPOCH_ENTER(et);
               sis_startl(sc->sis_ifp);
+++        NET_EPOCH_EXIT(et);
+++         }
}

To be continued ...

@zlei thanks for all these findings! It seems that at least few of them were violating epoch even before the suggested bpf change. Are you going to commit the fixes? I can help. To me some of these fixes do not look related to bpf at all.

In D53871#1238154, @glebius wrote:

@zlei thanks for all these findings! It seems that at least few of them were violating epoch even before the suggested bpf change. Are you going to commit the fixes? I can help. To me some of these fixes do not look related to bpf at all.

Indeed I have WIP to remove NET_EPOCH_ENTER / NET_EPOCH_EXIT from bpf.c and that is almost identical to your work. Well I must admit that I never finished it as there' re too many places to check manual.

$ grep -Er 'BPF_[M]?TAP|bpf_[m]?tap' sys
.... lots of drivers

I'll try to find them all and send you a patch. Well, this change should also be noted in UPDATING, 3rd party drivers should also be aware this.

In D53871#1238332, @zlei wrote:
Indeed I have WIP to remove NET_EPOCH_ENTER / NET_EPOCH_EXIT from bpf.c and that is almost identical to your work. Well I must admit that I never finished it as there' re too many places to check manual.
$ grep -Er 'BPF_[M]?TAP|bpf_[m]?tap' sys
.... lots of drivers
I'll try to find them all and send you a patch.

Let's just work on them one by one. Most of them already are in the epoch. Those that are not, are all different to each other. Should be committed separately.

Well, this change should also be noted in UPDATING, 3rd party drivers should also be aware this.

I'm not sure about that. The bpf is a network stack thing and it usually is at low level, it is supposed to be entered with epoch. The fact if entered it recursively, I guess, was just a quick fix from the painful era of transitioning to the epoch.

However, I have plans for newer more advanced KPI for bpf_attach. If that happens, then of course a documentation entry will happen.

I uploaded the patch to https://reviews.freebsd.org/F139754092 . I might miss some drivers but that should be almost completed. While working on this, I'm not fully convinced this is a good approach. I'd argue the bpf part is too tightly coupled with the net epoch.

Ideally, when the net stack pass mbufs to a driver ( the hardware for example ), it is up to the driver to handle the rest work, enqueue and pass to nic's internal storage, then no net epoch required.

What if we introduce a new epoch, say bpf_epoch_preempt for bpf only ? I have not benchmarked yet so I've no idea how much cost of entering a new bpf epoch will be.

In D53871#1239268, @zlei wrote:

I uploaded the patch to https://reviews.freebsd.org/F139754092 . I might miss some drivers but that should be almost completed. While working on this, I'm not fully convinced this is a good approach. I'd argue the bpf part is too tightly coupled with the net epoch.

Thanks a lot! Do you plan to commit? Maybe the watchdogged drivers can be generalized in some way...

Ideally, when the net stack pass mbufs to a driver ( the hardware for example ), it is up to the driver to handle the rest work, enqueue and pass to nic's internal storage, then no net epoch required.

When net stack pass mbufs to a driver, we always are already in the net epoch. Hence this revision exists.

What if we introduce a new epoch, say bpf_epoch_preempt for bpf only ? I have not benchmarked yet so I've no idea how much cost of entering a new bpf epoch will be.

Why would we do that? Introducing a new instance of epoch(9) makes sense only outside of the network stack. I can't see any good reason to have two epochs in the network stack. I can imagine only a case we have some new kind of allocation that has a very high alloc/free traffic and can't use SMR, then we might want to separate it from the main epoch. But that is not going to иу bpf structures.

Rebase. USB already has been taken care of.

Harbormaster completed remote builds in B69319: Diff 168228.Dec 16 2025, 9:33 PM

In D53871#1239274, @glebius wrote:

In D53871#1239268, @zlei wrote:

I uploaded the patch to https://reviews.freebsd.org/F139754092 . I might miss some drivers but that should be almost completed. While working on this, I'm not fully convinced this is a good approach. I'd argue the bpf part is too tightly coupled with the net epoch.

Thanks a lot! Do you plan to commit? Maybe the watchdogged drivers can be generalized in some way...

I'm still checking if I missed any drivers. There is a pattern, that re-initializing the driver in net epoch context when the watchdog time out, appears wrong to me.

Ideally, when the net stack pass mbufs to a driver ( the hardware for example ), it is up to the driver to handle the rest work, enqueue and pass to nic's internal storage, then no net epoch required.

When net stack pass mbufs to a driver, we always are already in the net epoch. Hence this revision exists.

What if we introduce a new epoch, say bpf_epoch_preempt for bpf only ? I have not benchmarked yet so I've no idea how much cost of entering a new bpf epoch will be.

Why would we do that? Introducing a new instance of epoch(9) makes sense only outside of the network stack. I can't see any good reason to have two epochs in the network stack. I can imagine only a case we have some new kind of allocation that has a very high alloc/free traffic and can't use SMR, then we might want to separate it from the main epoch. But that is not going to иу bpf structures.

I'm aware of that. My main concern is that, the newly added NET_EPOCH_ENTER / NET_EPOCH_EXIT in my patch is only required only by bpf, well I know some people disabled bpf entirely. So the bpf is tightly coupled with net epoch. If a driver is not tested throughout ( in this case typically the watchdog path ) with INVARIANTS option enabled, and in production a potential silent memory corruption might not be easily caught.

So three options,

Add a sysctl knob to enable checking in_epoch(net_epoch_preempt) in production ( which should be off by default ), and turn it on when diagnosing.
Keep current logic as is, at the cost of recursively entering net epoch in the fast TX / RX path.
Use a separate epoch, say bpf_epoch_preempt, or some other non-blocking approach for bpf part.

For the last two options. I think benchmark is required to help deciding which one is preferred. The bpf is not at zero cost, so how many cycles can be saved really depends. The 3rd option might be the most expensive, since bpf_[m]tap currently tap every single mbuf. I think ideally bpf_[m]tap can batch tapping the mbufs, so to minimize the cost of entering epoch.

PS: I wonder if we can separate bpf_[m]tap into bpf_[m]tap_in and bpf_[m]tap_out for RX and TX. So that each direction can be tapped separately to minimize the impact on tapping traffic in one direction. A typical usage should be tcpdump --direction=[in,out,inout].,

Revision Contents
Changeset List

Path

Size

sys/

net/

bpf.c

15 lines

Diff 168228

View Options

bpf: don't recursively enter the network epoch in bpf tapsNeeds ReviewPublicActions

Details

Diff Detail

Event Timeline

Revision ContentsChangeset List

Diff 168228

sys/net/bpf.c

bpf: don't recursively enter the network epoch in bpf taps
Needs ReviewPublic
Actions

Revision Contents
Changeset List