Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -2057,13 +2057,8 @@
 		return (rc);
 	}
 #ifdef RATELIMIT
-	if (m->m_pkthdr.snd_tag != NULL) {
-		/* EAGAIN tells the stack we are not the correct interface. */
-		if (__predict_false(ifp != m->m_pkthdr.snd_tag->ifp)) {
-			m_freem(m);
-			return (EAGAIN);
-		}
-
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 		return (ethofld_transmit(ifp, m));
 	}
 #endif
Index: sys/dev/cxgbe/t4_sched.c
===================================================================
--- sys/dev/cxgbe/t4_sched.c
+++ sys/dev/cxgbe/t4_sched.c
@@ -789,7 +789,7 @@
 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
 	mbufq_init(&cst->pending_tx, INT_MAX);
 	mbufq_init(&cst->pending_fwack, INT_MAX);
-	cst->com.ifp = ifp;
+	m_snd_tag_init(&cst->com, ifp);
 	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
 	cst->adapter = sc;
 	cst->port_id = pi->port_id;
Index: sys/dev/cxgbe/t4_sge.c
===================================================================
--- sys/dev/cxgbe/t4_sge.c
+++ sys/dev/cxgbe/t4_sge.c
@@ -2325,7 +2325,7 @@
 needs_eo(struct mbuf *m)
 {
 
-	return (m->m_pkthdr.snd_tag != NULL);
+	return (m->m_pkthdr.csum_flags & CSUM_SND_TAG);
 }
 #endif
 
@@ -2539,8 +2539,11 @@
 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
 	 * right things.
 	 */
-	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0)))
+	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0))) {
+		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
+		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+	}
 #endif
 
 	if (!needs_tso(m0) &&
@@ -5922,6 +5925,21 @@
 				cst->tx_nocompl = 0;
 			}
 			(void) mbufq_dequeue(&cst->pending_tx);
+
+			/*
+			 * Drop the mbuf's reference on the tag now rather
+			 * than waiting until m_freem().  This ensures that
+			 * cxgbe_snd_tag_free() is called as soon as the inp
+			 * drops its reference and no mbufs remain in the
+			 * pending_tx queue, so that it can flush any pending
+			 * requests.  Otherwise, if the last mbuf doesn't
+			 * request a completion, the etid will never be
+			 * released.
+			 */
+			m->m_pkthdr.snd_tag = NULL;
+			m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+			m_snd_tag_rele(&cst->com);
+
 			mbufq_enqueue(&cst->pending_fwack, m);
 		}
 	}
@@ -5933,6 +5951,7 @@
 	int rc;
 
 	MPASS(m0->m_nextpkt == NULL);
+	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_cst(m0->m_pkthdr.snd_tag);
 
@@ -5967,8 +5986,18 @@
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
+	/*
+	 * Hold an extra reference on the tag while generating work
+	 * requests to ensure that we don't try to free the tag during
+	 * ethofld_tx() in case we are sending the final mbuf after
+	 * the inp was freed.
+	 */
+	m_snd_tag_ref(&cst->com);
 	ethofld_tx(cst);
-	rc = 0;
+	mtx_unlock(&cst->lock);
+	m_snd_tag_rele(&cst->com);
+	return (0);
+
 done:
 	mtx_unlock(&cst->lock);
 	if (__predict_false(rc != 0))
@@ -6015,7 +6044,6 @@
 
 		cst->flags &= ~EO_FLUSH_RPL_PENDING;
 		cst->tx_credits += cpl->credits;
-freetag:
 		cxgbe_snd_tag_free_locked(cst);
 		return (0);	/* cst is gone. */
 	}
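[Editor's note — illustrative sketch, not part of the patch.] ethofld_transmit() above and ethofld_fw4_ack() below lean on the same idiom: take a transient m_snd_tag_ref() before calling ethofld_tx(), because that call can release the last mbuf-held reference and would otherwise destroy the tag mid-call. A minimal userspace model of the idiom; struct tag, tag_ref(), tag_rele() and drain_queue() are stand-ins, not kernel APIs:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct m_snd_tag: just a reference count. */
    struct tag {
            atomic_uint refcount;
    };

    static struct tag *
    tag_ref(struct tag *t)          /* models m_snd_tag_ref() */
    {
            atomic_fetch_add(&t->refcount, 1);
            return (t);
    }

    static void
    tag_rele(struct tag *t)         /* models m_snd_tag_rele() */
    {
            if (atomic_fetch_sub(&t->refcount, 1) == 1)
                    free(t);        /* last reference destroys the tag */
    }

    /*
     * Models ethofld_tx(): draining the queue may drop the reference
     * held by the final queued mbuf.
     */
    static void
    drain_queue(struct tag *t)
    {
            tag_rele(t);
    }

    int
    main(void)
    {
            struct tag *t = malloc(sizeof(*t));

            atomic_init(&t->refcount, 1);   /* the queued mbuf's reference */

            tag_ref(t);     /* transient ref keeps t alive across the call */
            drain_queue(t);
            /* Safe to touch t here only because of the transient ref. */
            printf("refcount after drain: %u\n", atomic_load(&t->refcount));
            tag_rele(t);    /* now the tag is actually destroyed */
            return (0);
    }

The transient reference turns "may be freed inside the call" into "freed strictly after the caller's last use", which is what the m_snd_tag_ref()/m_snd_tag_rele() pairs in this file buy.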
@@ -6033,22 +6061,27 @@
 		cst->tx_credits += cpl->credits;
 		MPASS(cst->tx_credits <= cst->tx_total);
 
-		m = mbufq_first(&cst->pending_tx);
-		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
-			ethofld_tx(cst);
-
-		if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) &&
-		    cst->ncompl == 0) {
-			if (cst->tx_credits == cst->tx_total)
-				goto freetag;
-			else {
-				MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0);
-				send_etid_flush_wr(cst);
-			}
+		if (cst->flags & EO_SND_TAG_REF) {
+			/*
+			 * As with ethofld_transmit(), hold an extra
+			 * reference so that the tag is stable across
+			 * ethofld_tx().
+			 */
+			m_snd_tag_ref(&cst->com);
+			m = mbufq_first(&cst->pending_tx);
+			if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
+				ethofld_tx(cst);
+			mtx_unlock(&cst->lock);
+			m_snd_tag_rele(&cst->com);
+		} else {
+			/*
+			 * There shouldn't be any pending packets if the
+			 * tag was freed by the kernel, since any pending
+			 * packet should hold a reference to the tag.
+			 */
+			MPASS(mbufq_first(&cst->pending_tx) == NULL);
+			mtx_unlock(&cst->lock);
 		}
-		mtx_unlock(&cst->lock);
-
 		return (0);
 }
 #endif
Index: sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -885,7 +885,7 @@
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
-	err = -tcp_lro_init_args(&rq->lro, c->tag.m_snd_tag.ifp, TCP_LRO_ENTRIES, wq_sz);
+	err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz);
 	if (err)
 		goto err_rq_wq_destroy;
 
@@ -915,7 +915,7 @@
 #endif
 	}
 
-	rq->ifp = c->tag.m_snd_tag.ifp;
+	rq->ifp = priv->ifp;
 	rq->channel = c;
 	rq->ix = c->ix;
 
@@ -1773,7 +1773,6 @@
 	c->priv = priv;
 	c->ix = ix;
 	/* setup send tag */
-	c->tag.m_snd_tag.ifp = priv->ifp;
 	c->tag.type = IF_SND_TAG_TYPE_UNLIMITED;
 	c->mkey_be = cpu_to_be32(priv->mr.key);
 	c->num_tc = priv->num_tc;
@@ -3557,6 +3556,8 @@
 	if (unlikely(pch->sq[0].running == 0))
 		return (ENXIO);
 	mlx5e_ref_channel(priv);
+	MPASS(pch->tag.m_snd_tag.refcount == 0);
+	m_snd_tag_init(&pch->tag.m_snd_tag, ifp);
 	*ppmt = &pch->tag.m_snd_tag;
 	return (0);
 }
Index: sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
@@ -841,7 +841,6 @@
 	for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
 		struct mlx5e_rl_channel *channel = rlw->channels + i;
 		channel->worker = rlw;
-		channel->tag.m_snd_tag.ifp = priv->ifp;
 		channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
 		STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
 	}
@@ -1125,6 +1124,8 @@
 	}
 
 	/* store pointer to mbuf tag */
+	MPASS(channel->tag.m_snd_tag.refcount == 0);
+	m_snd_tag_init(&channel->tag.m_snd_tag, ifp);
 	*ppmt = &channel->tag.m_snd_tag;
 done:
 	return (error);
Index: sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -83,10 +83,6 @@
 	struct mlx5e_snd_tag *ptag;
 	struct mlx5e_sq *sq;
 
-	/* check for route change */
-	if (mb->m_pkthdr.snd_tag->ifp != ifp)
-		return (NULL);
-
 	/* get pointer to sendqueue */
 	ptag = container_of(mb->m_pkthdr.snd_tag,
 	    struct mlx5e_snd_tag, m_snd_tag);
@@ -609,21 +605,10 @@
 	struct mlx5e_sq *sq;
 	int ret;
 
-	if (mb->m_pkthdr.snd_tag != NULL) {
+	if (mb->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		MPASS(mb->m_pkthdr.snd_tag->ifp == ifp);
 		sq = mlx5e_select_queue_by_send_tag(ifp, mb);
 		if (unlikely(sq == NULL)) {
-			/* Check for route change */
-			if (mb->m_pkthdr.snd_tag->ifp != ifp) {
-				/* Free mbuf */
-				m_freem(mb);
-
-				/*
-				 * Tell upper layers about route
-				 * change and to re-transmit this
-				 * packet:
-				 */
-				return (EAGAIN);
-			}
 			goto select_queue;
 		}
 	} else {
Index: sys/kern/kern_mbuf.c
===================================================================
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -46,8 +46,12 @@
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
+#include <sys/refcount.h>
 #include <sys/sysctl.h>
 
+#include <net/if.h>
+#include <net/if_var.h>
+
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -112,6 +116,10 @@
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types");
 
+static counter_u64_t snd_tag_count;
+SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
+    &snd_tag_count, "# of active mbuf send tags");
+
 /*
  * tunable_mbinit() has to be run before any mbuf allocations are done.
  */
@@ -378,6 +386,8 @@
 	 */
 	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
 	    EVENTHANDLER_PRI_FIRST);
+
+	snd_tag_count = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
 
@@ -1149,3 +1159,24 @@
 	while (mb != NULL)
 		mb = m_free(mb);
 }
+
+void
+m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp)
+{
+
+	if_ref(ifp);
+	mst->ifp = ifp;
+	refcount_init(&mst->refcount, 1);
+	counter_u64_add(snd_tag_count, 1);
+}
+
+void
+m_snd_tag_destroy(struct m_snd_tag *mst)
+{
+	struct ifnet *ifp;
+
+	ifp = mst->ifp;
+	ifp->if_snd_tag_free(mst);
+	if_rele(ifp);
+	counter_u64_add(snd_tag_count, -1);
+}
Index: sys/kern/uipc_mbuf.c
===================================================================
--- sys/kern/uipc_mbuf.c
+++ sys/kern/uipc_mbuf.c
@@ -382,6 +382,10 @@
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
 	from->m_flags &= ~M_PKTHDR;
+	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+		from->m_pkthdr.snd_tag = NULL;
+	}
 }
 
 /*
@@ -414,6 +418,8 @@
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
+	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		m_snd_tag_ref(from->m_pkthdr.snd_tag);
 	SLIST_INIT(&to->m_pkthdr.tags);
 	return (m_tag_copy_chain(to, from, how));
 }
@@ -924,7 +930,12 @@
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
-		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+			n->m_pkthdr.snd_tag =
+			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
+			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+		} else
+			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
@@ -932,7 +943,12 @@
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
-		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+			n->m_pkthdr.snd_tag =
+			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
+			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+		} else
+			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
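[Editor's note — illustrative sketch, not part of the patch.] The uipc_mbuf.c hunks encode two different disciplines: m_move_pkthdr() transfers the tag reference to the destination and scrubs the source (so the reference is released exactly once), while m_dup_pkthdr() and m_split() leave two live packet headers and therefore must take a second reference. A self-contained model of the two disciplines (all names illustrative):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct tag {
            atomic_uint refcount;
    };

    static struct tag *
    tag_ref(struct tag *t)
    {
            atomic_fetch_add(&t->refcount, 1);
            return (t);
    }

    static void
    tag_rele(struct tag *t)
    {
            if (atomic_fetch_sub(&t->refcount, 1) == 1)
                    free(t);
    }

    struct pkthdr {
            struct tag *snd_tag;    /* NULL when no tag is attached */
    };

    /* Move: the destination inherits the reference; the source forgets it. */
    static void
    hdr_move(struct pkthdr *to, struct pkthdr *from)
    {
            to->snd_tag = from->snd_tag;
            from->snd_tag = NULL;   /* prevents a double release */
    }

    /* Copy: both headers stay live, so take a second reference. */
    static void
    hdr_dup(struct pkthdr *to, struct pkthdr *from)
    {
            to->snd_tag = from->snd_tag != NULL ?
                tag_ref(from->snd_tag) : NULL;
    }

    int
    main(void)
    {
            struct tag *t = malloc(sizeof(*t));
            struct pkthdr a = { t }, b = { NULL }, c = { NULL };

            atomic_init(&t->refcount, 1);
            hdr_move(&b, &a);               /* refcount stays 1 */
            hdr_dup(&c, &b);                /* refcount becomes 2 */
            assert(atomic_load(&t->refcount) == 2);
            tag_rele(b.snd_tag);
            tag_rele(c.snd_tag);            /* last release frees the tag */
            return (0);
    }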
Index: sys/net/bpf.c
===================================================================
--- sys/net/bpf.c
+++ sys/net/bpf.c
@@ -2225,7 +2225,7 @@
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
-	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
+	if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
@@ -2236,7 +2236,7 @@
 
 	BPFIF_RLOCK(bp);
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
-		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
+		if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
 #ifdef BPF_JITTER
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -4319,6 +4319,8 @@
 void
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
 }
 
Index: sys/net/if_ethersubr.c
===================================================================
--- sys/net/if_ethersubr.c
+++ sys/net/if_ethersubr.c
@@ -816,6 +816,7 @@
 	 * We will rely on rcvif being set properly in the deferred context,
 	 * so assert it is correct here.
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
 	    "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
 	CURVNET_SET_QUIET(ifp->if_vnet);
Index: sys/net/if_lagg.c
===================================================================
--- sys/net/if_lagg.c
+++ sys/net/if_lagg.c
@@ -95,6 +95,11 @@
 	{0, NULL}
 };
 
+struct lagg_snd_tag {
+	struct m_snd_tag com;
+	struct m_snd_tag *tag;
+};
+
 VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
 #define	V_lagg_list	VNET(lagg_list)
 VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
@@ -134,6 +139,10 @@
 static int	lagg_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *,
 		    struct m_snd_tag **);
+static int	lagg_snd_tag_modify(struct m_snd_tag *,
+		    union if_snd_tag_modify_params *);
+static int	lagg_snd_tag_query(struct m_snd_tag *,
+		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
@@ -516,6 +525,8 @@
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
+	ifp->if_snd_tag_query = lagg_snd_tag_query;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
@@ -1526,63 +1537,126 @@
 }
 
 #ifdef RATELIMIT
-static int
-lagg_snd_tag_alloc(struct ifnet *ifp,
-    union if_snd_tag_alloc_params *params,
-    struct m_snd_tag **ppmt)
+static inline struct lagg_snd_tag *
+mst_to_lst(struct m_snd_tag *mst)
 {
-	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+
+	return (__containerof(mst, struct lagg_snd_tag, com));
+}
+
+/*
+ * Look up the port used by a specific flow.  This only works for lagg
+ * protocols with deterministic port mappings (e.g. not roundrobin).
+ * In addition, protocols which use a hash to map flows to ports must
+ * be configured to use the mbuf flowid rather than hashing packet
+ * contents.
+ */
+static struct lagg_port *
+lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype)
+{
+	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 	uint32_t p;
 
-	LAGG_RLOCK();
+	sc = ifp->if_softc;
+
 	switch (sc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
-		lp = lagg_link_active(sc, sc->sc_primary);
-		break;
+		return (lagg_link_active(sc, sc->sc_primary));
 	case LAGG_PROTO_LOADBALANCE:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE) {
-			LAGG_RUNLOCK();
-			return (EOPNOTSUPP);
-		}
-		p = params->hdr.flowid >> sc->flowid_shift;
+		    flowtype == M_HASHTYPE_NONE)
+			return (NULL);
+		p = flowid >> sc->flowid_shift;
 		p %= sc->sc_count;
 		lb = (struct lagg_lb *)sc->sc_psc;
 		lp = lb->lb_ports[p];
-		lp = lagg_link_active(sc, lp);
-		break;
+		return (lagg_link_active(sc, lp));
 	case LAGG_PROTO_LACP:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE) {
-			LAGG_RUNLOCK();
-			return (EOPNOTSUPP);
-		}
-		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
-		break;
+		    flowtype == M_HASHTYPE_NONE)
+			return (NULL);
+		return (lacp_select_tx_port_by_hash(sc, flowid));
 	default:
-		LAGG_RUNLOCK();
-		return (EOPNOTSUPP);
+		return (NULL);
 	}
+}
+
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	struct lagg_snd_tag *lst;
+	struct lagg_softc *sc;
+	struct lagg_port *lp;
+	struct ifnet *lp_ifp;
+	int error;
+
+	sc = ifp->if_softc;
+
+	LAGG_RLOCK();
+	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
+	    params->hdr.flowtype);
 	if (lp == NULL) {
 		LAGG_RUNLOCK();
 		return (EOPNOTSUPP);
 	}
-	ifp = lp->lp_ifp;
+	if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) {
+		LAGG_RUNLOCK();
+		return (EOPNOTSUPP);
+	}
+	lp_ifp = lp->lp_ifp;
+	if_ref(lp_ifp);
 	LAGG_RUNLOCK();
 
-	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
-	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
-		return (EOPNOTSUPP);
+	lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
+	if (lst == NULL) {
+		if_rele(lp_ifp);
+		return (ENOMEM);
+	}
 
-	/* forward allocation request */
-	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+	error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag);
+	if_rele(lp_ifp);
+	if (error) {
+		free(lst, M_LAGG);
+		return (error);
+	}
+
+	m_snd_tag_init(&lst->com, ifp);
+
+	*ppmt = &lst->com;
+	return (0);
+}
+
+static int
+lagg_snd_tag_modify(struct m_snd_tag *mst,
+    union if_snd_tag_modify_params *params)
+{
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params));
+}
+
+static int
+lagg_snd_tag_query(struct m_snd_tag *mst,
+    union if_snd_tag_query_params *params)
+{
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	return (lst->tag->ifp->if_snd_tag_query(lst->tag, params));
 }
 
 static void
-lagg_snd_tag_free(struct m_snd_tag *tag)
+lagg_snd_tag_free(struct m_snd_tag *mst)
 {
-	tag->ifp->if_snd_tag_free(tag);
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	m_snd_tag_rele(lst->tag);
+	free(lst, M_LAGG);
 }
 #endif
 
@@ -1709,6 +1783,10 @@
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	int error;
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
+#endif
 	LAGG_RLOCK();
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
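[Editor's note — illustrative sketch, not part of the patch.] lookup_snd_tag_port() above only supports lagg protocols whose flow-to-port mapping is a pure function of the flow id: a tag allocated for a flow is only useful if later packets of that flow keep resolving to the same port. The LOADBALANCE arithmetic reduces to the following standalone model (lb_port is not a kernel function):

    #include <assert.h>
    #include <stdint.h>

    /*
     * Model of the LAGG_PROTO_LOADBALANCE mapping used by
     * lookup_snd_tag_port(): shift, then reduce modulo the port count.
     * Deterministic in flowid, which is what makes send tags viable.
     */
    static unsigned
    lb_port(uint32_t flowid, unsigned flowid_shift, unsigned nports)
    {
            return ((flowid >> flowid_shift) % nports);
    }

    int
    main(void)
    {
            uint32_t flowid = 0xdeadbeefu;

            /* Same flow always resolves to the same port... */
            assert(lb_port(flowid, 16, 4) == lb_port(flowid, 16, 4));
            /* ...while roundrobin has no such function, so it is excluded. */
            return (0);
    }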
@@ -1899,6 +1977,21 @@
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		struct lagg_snd_tag *lst;
+		struct m_snd_tag *mst;
+
+		mst = m->m_pkthdr.snd_tag;
+		lst = mst_to_lst(mst);
+		if (lst->tag->ifp != ifp) {
+			m_freem(m);
+			return (EAGAIN);
+		}
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
+		m_snd_tag_rele(mst);
+	}
+#endif
 	return (ifp->if_transmit)(ifp, m);
 }
 
Index: sys/net/if_vlan.c
===================================================================
--- sys/net/if_vlan.c
+++ sys/net/if_vlan.c
@@ -103,6 +103,20 @@
 	int	refcnt;
 };
 
+#ifdef RATELIMIT
+struct vlan_snd_tag {
+	struct m_snd_tag com;
+	struct m_snd_tag *tag;
+};
+
+static inline struct vlan_snd_tag *
+mst_to_vst(struct m_snd_tag *mst)
+{
+
+	return (__containerof(mst, struct vlan_snd_tag, com));
+}
+#endif
+
 /*
  * This macro provides a facility to iterate over every vlan on a trunk with
  * the assumption that none will be added/removed during iteration.
@@ -267,7 +281,11 @@
 #ifdef RATELIMIT
 static	int vlan_snd_tag_alloc(struct ifnet *,
     union if_snd_tag_alloc_params *, struct m_snd_tag **);
-static	void vlan_snd_tag_free(struct m_snd_tag *);
+static	int vlan_snd_tag_modify(struct m_snd_tag *,
+    union if_snd_tag_modify_params *);
+static	int vlan_snd_tag_query(struct m_snd_tag *,
+    union if_snd_tag_query_params *);
+static	void vlan_snd_tag_free(struct m_snd_tag *);
 #endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
@@ -1048,6 +1066,8 @@
 	ifp->if_ioctl = vlan_ioctl;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+	ifp->if_snd_tag_modify = vlan_snd_tag_modify;
+	ifp->if_snd_tag_query = vlan_snd_tag_query;
 	ifp->if_snd_tag_free = vlan_snd_tag_free;
 #endif
 	ifp->if_flags = VLAN_IFFLAGS;
@@ -1137,6 +1157,26 @@
 
 	BPF_MTAP(ifp, m);
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		struct vlan_snd_tag *vst;
+		struct m_snd_tag *mst;
+
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
+		mst = m->m_pkthdr.snd_tag;
+		vst = mst_to_vst(mst);
+		if (vst->tag->ifp != p) {
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			NET_EPOCH_EXIT(et);
+			m_freem(m);
+			return (EAGAIN);
+		}
+
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag);
+		m_snd_tag_rele(mst);
+	}
+#endif
+
 	/*
 	 * Do not run parent's if_transmit() if the parent is not up,
 	 * or parent's driver will cause a system crash.
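[Editor's note — illustrative sketch, not part of the patch.] if_lagg.c and if_vlan.c now share one wrapper shape: the generic m_snd_tag is embedded as the first member (com) next to a pointer to the parent interface's real tag, and __containerof() recovers the wrapper from the generic pointer the stack hands back, so _modify/_query/_free simply forward to the inner tag. A standalone model of that layout (container_of here is a local macro, not the kernel's __containerof):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Generic handle handed to the stack; stands in for struct m_snd_tag. */
    struct snd_tag {
            int id;
    };

    /* Wrapper, as in struct vlan_snd_tag / struct lagg_snd_tag. */
    struct wrap_tag {
            struct snd_tag com;     /* embedded generic part */
            struct snd_tag *inner;  /* the parent interface's tag */
    };

    /* Member pointer back to the enclosing struct. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Forwarding operation, as vlan_snd_tag_query() forwards inward. */
    static int
    wrap_query(struct snd_tag *mst)
    {
            struct wrap_tag *w = container_of(mst, struct wrap_tag, com);

            return (w->inner->id);  /* operate on the wrapped tag */
    }

    int
    main(void)
    {
            struct snd_tag parent_tag = { .id = 42 };
            struct wrap_tag w = { .com = { .id = 1 }, .inner = &parent_tag };

            /* The stack only ever sees &w.com... */
            assert(wrap_query(&w.com) == 42);
            printf("forwarded query -> %d\n", wrap_query(&w.com));
            return (0);
    }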
@@ -1928,18 +1968,71 @@
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
+	struct epoch_tracker et;
+	struct vlan_snd_tag *vst;
+	struct ifvlan *ifv;
+	struct ifnet *parent;
+	int error;
 
-	/* get trunk device */
-	ifp = vlan_trunkdev(ifp);
-	if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+	NET_EPOCH_ENTER(et);
+	ifv = ifp->if_softc;
+	if (ifv->ifv_trunk != NULL)
+		parent = PARENT(ifv);
+	else
+		parent = NULL;
+	if (parent == NULL || parent->if_snd_tag_alloc == NULL) {
+		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
-	/* forward allocation request */
-	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+	}
+	if_ref(parent);
+	NET_EPOCH_EXIT(et);
+
+	vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT);
+	if (vst == NULL) {
+		if_rele(parent);
+		return (ENOMEM);
+	}
+
+	error = parent->if_snd_tag_alloc(parent, params, &vst->tag);
+	if_rele(parent);
+	if (error) {
+		free(vst, M_VLAN);
+		return (error);
+	}
+
+	m_snd_tag_init(&vst->com, ifp);
+
+	*ppmt = &vst->com;
+	return (0);
+}
+
+static int
+vlan_snd_tag_modify(struct m_snd_tag *mst,
+    union if_snd_tag_modify_params *params)
+{
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params));
+}
+
+static int
+vlan_snd_tag_query(struct m_snd_tag *mst,
+    union if_snd_tag_query_params *params)
+{
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	return (vst->tag->ifp->if_snd_tag_query(vst->tag, params));
 }
 
 static void
-vlan_snd_tag_free(struct m_snd_tag *tag)
+vlan_snd_tag_free(struct m_snd_tag *mst)
 {
-	tag->ifp->if_snd_tag_free(tag);
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	m_snd_tag_rele(vst->tag);
+	free(vst, M_VLAN);
 }
 #endif
Index: sys/net/netisr.c
===================================================================
--- sys/net/netisr.c
+++ sys/net/netisr.c
@@ -839,6 +839,7 @@
 	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
 	    npp->np_name));
 
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL)
 		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
Index: sys/net80211/ieee80211_hwmp.c
===================================================================
--- sys/net80211/ieee80211_hwmp.c
+++ sys/net80211/ieee80211_hwmp.c
@@ -2015,6 +2015,7 @@
 		 */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest,
 		    "%s", "queue frame until path found");
+		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		m->m_pkthdr.rcvif = (void *)(uintptr_t)
 		    ieee80211_mac_hash(ic, dest);
 		/* XXX age chosen randomly */
Index: sys/net80211/ieee80211_mesh.c
===================================================================
--- sys/net80211/ieee80211_mesh.c
+++ sys/net80211/ieee80211_mesh.c
@@ -1225,6 +1225,7 @@
 	M_WME_SETAC(mcopy, WME_AC_BE);
 
 	/* XXX do we know m_nextpkt is NULL? */
+	MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mcopy->m_pkthdr.rcvif = (void *) ni;
 
 	/*
Index: sys/net80211/ieee80211_output.c
===================================================================
--- sys/net80211/ieee80211_output.c
+++ sys/net80211/ieee80211_output.c
@@ -163,6 +163,7 @@
 	 * uses any existing value for rcvif to identify the
 	 * interface it (might have been) received on.
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1: 0;
 
@@ -528,6 +529,7 @@
 	 * that the mbuf has the same node value that
 	 * it would if it were going via the normal path.
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 
 	/*
Index: sys/net80211/ieee80211_wds.c
===================================================================
--- sys/net80211/ieee80211_wds.c
+++ sys/net80211/ieee80211_wds.c
@@ -299,6 +299,7 @@
 			continue;
 		}
 		mcopy->m_flags |= M_MCAST;
+		MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		mcopy->m_pkthdr.rcvif = (void *) ni;
 
 		err = ieee80211_parent_xmitpkt(ic, mcopy);
@@ -332,6 +333,7 @@
 	 * XXX handle overflow?
 	 * XXX per/vap beacon interval?
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)(uintptr_t)
 	    ieee80211_mac_hash(ic, ni->ni_macaddr);
 	(void) ieee80211_ageq_append(&ic->ic_stageq, m,
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -3274,13 +3274,6 @@
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
-
-		/*
-		 * At success increment the refcount on
-		 * the send tag's network interface:
-		 */
-		if (error == 0)
-			if_ref(inp->inp_snd_tag->ifp);
 	}
 	return (error);
 }
@@ -3293,7 +3286,6 @@
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 
 	INP_WLOCK_ASSERT(inp);
 
@@ -3303,19 +3295,7 @@
 	if (mst == NULL)
 		return;
 
-	ifp = mst->ifp;
-	if (ifp == NULL)
-		return;
-
-	/*
-	 * If the device was detached while we still had reference(s)
-	 * on the ifp, we assume if_snd_tag_free() was replaced with
-	 * stubs.
-	 */
-	ifp->if_snd_tag_free(mst);
-
-	/* release reference count on network interface */
-	if_rele(ifp);
+	m_snd_tag_rele(mst);
 }
 
 /*
@@ -3360,6 +3340,17 @@
 	 */
 	max_pacing_rate = socket->so_max_pacing_rate;
 
+	/*
+	 * If the existing send tag is for the wrong interface due to
+	 * a route change, first drop the existing tag.  Set the
+	 * CHANGED flag so that we will keep trying to allocate a new
+	 * tag if we fail to allocate one this time.
+	 */
+	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
+		in_pcbdetach_txrtlmt(inp);
+		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+	}
+
 	/*
 	 * NOTE: When attaching to a network interface a reference is
 	 * made to ensure the network interface doesn't go away until
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -203,6 +203,51 @@
 	return 0;
 }
 
+static __inline int
+ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
+    const struct sockaddr_in *gw, struct route *ro)
+{
+	struct m_snd_tag *mst;
+	int error;
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+	mst = NULL;
+
+#ifdef RATELIMIT
+	if (inp != NULL && mst == NULL) {
+		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
+		    (inp->inp_snd_tag != NULL &&
+		    inp->inp_snd_tag->ifp != ifp))
+			in_pcboutput_txrtlmt(inp, ifp, m);
+
+		if (inp->inp_snd_tag != NULL)
+			mst = inp->inp_snd_tag;
+	}
+#endif
+	if (mst != NULL) {
+		KASSERT(m->m_pkthdr.rcvif == NULL,
+		    ("trying to add a send tag to a forwarded packet"));
+		if (mst->ifp != ifp) {
+			error = EAGAIN;
+			goto done;
+		}
+
+		/* stamp send tag on mbuf */
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
+		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+	}
+
+	error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro);
+
+done:
+	/* Check for route change invalidating send tags. */
+#ifdef RATELIMIT
+	if (error == EAGAIN)
+		in_pcboutput_eagain(inp);
+#endif
+	return (error);
+}
+
 /*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
@@ -656,23 +701,7 @@
 	 */
 	m_clrprotoflags(m);
 	IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
-#ifdef RATELIMIT
-	if (inp != NULL) {
-		if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-			in_pcboutput_txrtlmt(inp, ifp, m);
-		/* stamp send tag on mbuf */
-		m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-	} else {
-		m->m_pkthdr.snd_tag = NULL;
-	}
-#endif
-	error = (*ifp->if_output)(ifp, m,
-	    (const struct sockaddr *)gw, ro);
-#ifdef RATELIMIT
-	/* check for route change */
-	if (error == EAGAIN)
-		in_pcboutput_eagain(inp);
-#endif
+	error = ip_output_send(inp, ifp, m, gw, ro);
 	goto done;
 }
 
@@ -708,23 +737,7 @@
 
 		IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 		    mtod(m, struct ip *), NULL);
-#ifdef RATELIMIT
-		if (inp != NULL) {
-			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-				in_pcboutput_txrtlmt(inp, ifp, m);
-			/* stamp send tag on mbuf */
-			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-		} else {
-			m->m_pkthdr.snd_tag = NULL;
-		}
-#endif
-		error = (*ifp->if_output)(ifp, m,
-		    (const struct sockaddr *)gw, ro);
-#ifdef RATELIMIT
-		/* check for route change */
-		if (error == EAGAIN)
-			in_pcboutput_eagain(inp);
-#endif
+		error = ip_output_send(inp, ifp, m, gw, ro);
 	} else
 		m_freem(m);
 }
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -228,7 +228,20 @@
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
-		m->m_flags = m0->m_flags & M_COPYFLAGS;
+
+		/*
+		 * Make sure the complete packet header gets copied
+		 * from the originating mbuf to the newly created
+		 * mbuf.  This also ensures that existing firewall
+		 * classification(s), VLAN tags and so on get copied
+		 * to the resulting fragmented packet(s):
+		 */
+		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
+			m_free(m);
+			IP6STAT_INC(ip6s_odropped);
+			return (ENOBUFS);
+		}
+
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		m->m_data += max_linkhdr;
@@ -253,8 +266,6 @@
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
-		m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum;
-		m->m_pkthdr.rcvif = NULL;
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
@@ -265,6 +276,51 @@
 	return (0);
 }
 
+static __inline int
+ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
+    struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro)
+{
+	struct m_snd_tag *mst;
+	int error;
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+	mst = NULL;
+
+#ifdef RATELIMIT
+	if (inp != NULL && mst == NULL) {
+		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
+		    (inp->inp_snd_tag != NULL &&
+		    inp->inp_snd_tag->ifp != ifp))
+			in_pcboutput_txrtlmt(inp, ifp, m);
+
+		if (inp->inp_snd_tag != NULL)
+			mst = inp->inp_snd_tag;
+	}
+#endif
+	if (mst != NULL) {
+		KASSERT(m->m_pkthdr.rcvif == NULL,
+		    ("trying to add a send tag to a forwarded packet"));
+		if (mst->ifp != ifp) {
+			error = EAGAIN;
+			goto done;
+		}
+
+		/* stamp send tag on mbuf */
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
+		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+	}
+
+	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
+
+done:
+	/* Check for route change invalidating send tags. */
+#ifdef RATELIMIT
+	if (error == EAGAIN)
+		in_pcboutput_eagain(inp);
+#endif
+	return (error);
+}
+
 /*
 * IP6 output.  The packet in mbuf chain m contains a skeletal IP6
 * header (with pri, len, nxt, hlim, src, dst).
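[Editor's note — illustrative sketch, not part of the patch.] ip_output_send() and ip6_output_send() enforce the same contract: a tag is stamped only when it was allocated for the interface that routing actually selected; a mismatch fails the send with EAGAIN so that in_pcboutput_eagain() can arrange for a replacement tag. The contract in isolation (a model; output_send is not the kernel function):

    #include <assert.h>
    #include <errno.h>
    #include <stddef.h>

    struct iface {
            int index;
    };

    struct tag {
            struct iface *ifp;      /* interface the tag was allocated on */
    };

    /*
     * Model of the stamping step: succeed only when the routing decision
     * still matches the interface the tag is bound to.
     */
    static int
    output_send(struct tag *t, struct iface *chosen)
    {
            if (t != NULL && t->ifp != chosen)
                    return (EAGAIN); /* caller must re-allocate the tag */
            /* ...stamp tag on the packet and hand it to the interface... */
            return (0);
    }

    int
    main(void)
    {
            struct iface a = { 1 }, b = { 2 };
            struct tag t = { &a };

            assert(output_send(&t, &a) == 0);       /* route unchanged */
            assert(output_send(&t, &b) == EAGAIN);  /* route moved to b */
            assert(output_send(NULL, &b) == 0);     /* untagged traffic */
            return (0);
    }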
@@ -957,23 +1013,7 @@
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
-#ifdef RATELIMIT
-		if (inp != NULL) {
-			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-				in_pcboutput_txrtlmt(inp, ifp, m);
-			/* stamp send tag on mbuf */
-			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-		} else {
-			m->m_pkthdr.snd_tag = NULL;
-		}
-#endif
-		error = nd6_output_ifp(ifp, origifp, m, dst,
-		    (struct route *)ro);
-#ifdef RATELIMIT
-		/* check for route change */
-		if (error == EAGAIN)
-			in_pcboutput_eagain(inp);
-#endif
+		error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 		goto done;
 	}
 
@@ -1072,23 +1112,7 @@
 			counter_u64_add(ia->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 		}
-#ifdef RATELIMIT
-		if (inp != NULL) {
-			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-				in_pcboutput_txrtlmt(inp, ifp, m);
-			/* stamp send tag on mbuf */
-			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-		} else {
-			m->m_pkthdr.snd_tag = NULL;
-		}
-#endif
-		error = nd6_output_ifp(ifp, origifp, m, dst,
-		    (struct route *)ro);
-#ifdef RATELIMIT
-		/* check for route change */
-		if (error == EAGAIN)
-			in_pcboutput_eagain(inp);
-#endif
+		error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 	} else
 		m_freem(m);
 }
Index: sys/netpfil/ipfw/ip_fw2.c
===================================================================
--- sys/netpfil/ipfw/ip_fw2.c
+++ sys/netpfil/ipfw/ip_fw2.c
@@ -1758,7 +1758,7 @@
 		oif = NULL;
 	} else {
 		MPASS(args->flags & IPFW_ARGS_OUT);
-		iif = mem ? NULL : m->m_pkthdr.rcvif;
+		iif = mem ? NULL : m_rcvif(m);
 		oif = args->ifp;
 	}
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -40,6 +40,7 @@
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
+#include <sys/refcount.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
@@ -138,6 +139,7 @@
 */
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
+	volatile u_int refcount;
 };
 
 /*
@@ -494,6 +496,8 @@
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
+#define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
+
 /*
 * CSUM flag description for use with printf(9) %b identifier.
 */
@@ -503,7 +507,7 @@
     "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
     "\16CSUM_IP6_ISCSI" \
     "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
-    "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED"
+    "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
@@ -633,6 +637,8 @@
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int);
+void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *);
+void		 m_snd_tag_destroy(struct m_snd_tag *);
 
 static __inline int
 m_gettype(int size)
@@ -995,6 +1001,17 @@
 */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
+/* Return the rcvif of a packet header. */
+static __inline struct ifnet *
+m_rcvif(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		return (NULL);
+	return (m->m_pkthdr.rcvif);
+}
+
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
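[Editor's note — illustrative sketch, not part of the patch.] The reason m_rcvif() exists, and the reason for the MPASS((...csum_flags & CSUM_SND_TAG) == 0) guards before every rcvif write throughout this patch, is that snd_tag shares pkthdr storage with rcvif, with CSUM_SND_TAG recording which member is live. A standalone model of the aliasing hazard and the checked accessor (struct names illustrative):

    #include <assert.h>
    #include <stddef.h>

    #define CSUM_SND_TAG 0x80000000u

    struct iface;
    struct tag;

    /* Model: one pointer slot is shared between the RX and TX roles. */
    struct pkthdr {
            union {
                    struct iface *rcvif;    /* valid when CSUM_SND_TAG clear */
                    struct tag *snd_tag;    /* valid when CSUM_SND_TAG set */
            };
            unsigned csum_flags;
    };

    /* Checked accessor, mirroring m_rcvif(): never misreads a tag. */
    static struct iface *
    hdr_rcvif(const struct pkthdr *h)
    {
            if (h->csum_flags & CSUM_SND_TAG)
                    return (NULL);
            return (h->rcvif);
    }

    int
    main(void)
    {
            struct tag *fake_tag = (struct tag *)0x1;
            struct pkthdr rx = { .rcvif = NULL, .csum_flags = 0 };
            struct pkthdr tx = { .snd_tag = fake_tag,
                .csum_flags = CSUM_SND_TAG };

            assert(hdr_rcvif(&rx) == NULL);
            /* A raw read of tx.rcvif would alias the tag pointer: */
            assert(hdr_rcvif(&tx) == NULL);
            return (0);
    }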
@@ -1185,6 +1202,22 @@
 	    m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
+static inline struct m_snd_tag *
+m_snd_tag_ref(struct m_snd_tag *mst)
+{
+
+	refcount_acquire(&mst->refcount);
+	return (mst);
+}
+
+static inline void
+m_snd_tag_rele(struct m_snd_tag *mst)
+{
+
+	if (refcount_release(&mst->refcount))
+		m_snd_tag_destroy(mst);
+}
+
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
@@ -1193,6 +1226,8 @@
 	MBUF_PROBE1(m__free, m);
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
+	if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 	if (m->m_flags & M_EXT)
 		mb_free_ext(m);
 	else if ((m->m_flags & M_NOFREE) == 0)
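[Editor's note — illustrative sketch, not part of the patch.] Putting the mbuf.h and kern_mbuf.c pieces together, the tag lifecycle is: m_snd_tag_init() hands the caller the first reference and bumps the global counter; every mbuf stamped with the tag holds its own reference; releasing the last reference runs m_snd_tag_destroy(), which invokes the driver's if_snd_tag_free callback and drops the ifnet reference. A compressed userspace model of that whole arc (all names are stand-ins):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Global active-tag counter, like the kern.ipc.num_snd_tags sysctl. */
    static atomic_uint snd_tag_count;

    struct tag {
            atomic_uint refcount;
            void (*drv_free)(struct tag *); /* like ifp->if_snd_tag_free */
    };

    static void
    tag_init(struct tag *t, void (*drv_free)(struct tag *))
    {
            atomic_init(&t->refcount, 1);   /* caller's reference */
            t->drv_free = drv_free;
            atomic_fetch_add(&snd_tag_count, 1);
    }

    static void
    tag_destroy(struct tag *t)
    {
            t->drv_free(t);                 /* driver releases its state */
            atomic_fetch_sub(&snd_tag_count, 1);
    }

    static void
    tag_rele(struct tag *t)
    {
            if (atomic_fetch_sub(&t->refcount, 1) == 1)
                    tag_destroy(t);
    }

    static void
    drv_free(struct tag *t)
    {
            printf("driver free callback ran\n");
            free(t);
    }

    int
    main(void)
    {
            struct tag *t = malloc(sizeof(*t));

            tag_init(t, drv_free);                  /* connection's ref */
            atomic_fetch_add(&t->refcount, 1);      /* an in-flight mbuf */
            tag_rele(t);                            /* mbuf freed */
            tag_rele(t);                            /* detach; destroys */
            assert(atomic_load(&snd_tag_count) == 0);
            return (0);
    }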