Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -629,6 +629,7 @@ struct ifnet *ifp; /* the interface this rxq belongs to */ struct lro_ctrl lro; /* LRO state */ + struct mbufqc mq; /* mbuf queue */ /* stats for common events first */ Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -1709,6 +1709,13 @@ tcp_lro_flush_all(lro); } } + + if (rxq->mq.start != NULL) { + struct ifnet *ifp = rxq->ifp; + ifp->if_input(ifp, rxq->mq.start); + rxq->mq.start = NULL; + rxq->mq.end = NULL; + } #endif t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | @@ -1943,6 +1950,7 @@ #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif + struct mbufqc *mq = &rxq->mq; uint16_t err_vec, tnl_type, tnlhdr_len; static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, @@ -2116,14 +2124,15 @@ (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 || M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) { if (sort_before_lro(lro)) { - tcp_lro_queue_mbuf(lro, m0); + if (!tcp_lro_queue_mbuf(lro, m0)) + mbufqc_enqueue(mq, m0); return (0); /* queued for sort, then LRO */ } if (tcp_lro_rx(lro, m0, 0) == 0) return (0); /* queued for LRO */ } #endif - ifp->if_input(ifp, m0); + mbufqc_enqueue(mq, m0); return (0); } Index: sys/dev/mlx5/mlx5_en/mlx5_en_rx.c =================================================================== --- sys/dev/mlx5/mlx5_en/mlx5_en_rx.c +++ sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -424,6 +424,8 @@ struct pfil_head *pfil; int i, rv; + struct mbufqc mq = {}; + CURVNET_SET_QUIET(rq->ifp->if_vnet); pfil = rq->channel->priv->pfil; for (i = 0; i < budget; i++) { @@ -514,19 +516,22 @@ #endif #if !defined(HAVE_TCP_LRO_RX) - tcp_lro_queue_mbuf(&rq->lro, mb); + if (!tcp_lro_queue_mbuf(&rq->lro, mb)) + mbufqc_enqueue(&mq, mb); #else if 
(mb->m_pkthdr.csum_flags == 0 || (rq->ifp->if_capenable & IFCAP_LRO) == 0 || rq->lro.lro_cnt == 0 || tcp_lro_rx(&rq->lro, mb, 0) != 0) { - rq->ifp->if_input(rq->ifp, mb); + mbufqc_enqueue(&mq, mb); } #endif wq_ll_pop: mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, &wqe->next.next_wqe_index); } + if (mq.start != NULL) + rq->ifp->if_input(rq->ifp, mq.start); CURVNET_RESTORE(); mlx5_cqwq_update_db_record(&rq->cq.wq); Index: sys/net/if_ethersubr.c =================================================================== --- sys/net/if_ethersubr.c +++ sys/net/if_ethersubr.c @@ -124,6 +124,9 @@ #define senderr(e) do { error = (e); goto bad;} while (0) +static struct mbuf *ether_demux_internal(struct ifnet *ifp, struct mbuf *m); +static void ether_dispatch(struct mbuf *m); + static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { @@ -515,21 +518,29 @@ * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ -static void +static struct mbuf * ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; + /* + * We will rely on rcvif being set properly in the deferred + * context, so assert it is correct here. 
+ */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " + "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); + if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); - return; + return (NULL); } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); - return; + return (NULL); } #endif if (m->m_len < ETHER_HDR_LEN) { @@ -539,7 +550,7 @@ m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); - return; + return (NULL); } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); @@ -555,7 +566,7 @@ case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); - return; + return (NULL); /* NOTREACHED */ break; }; @@ -563,8 +574,6 @@ #endif #endif - CURVNET_SET_QUIET(ifp->if_vnet); - if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; @@ -602,8 +611,7 @@ /* Allow monitor mode to claim this frame, after stats are updated. 
*/ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); - CURVNET_RESTORE(); - return; + return (NULL); } /* Handle input from a lagg(4) port */ @@ -614,8 +622,7 @@ if (m != NULL) ifp = m->m_pkthdr.rcvif; else { - CURVNET_RESTORE(); - return; + return (NULL); } } @@ -634,8 +641,7 @@ if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - CURVNET_RESTORE(); - return; + return (NULL); } evl = mtod(m, struct ether_vlan_header *); @@ -657,9 +663,9 @@ m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { - CURVNET_RESTORE(); - return; + return (NULL); } + /* XXX: check rcvif */ eh = mtod(m, struct ether_header *); } @@ -672,9 +678,9 @@ m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { - CURVNET_RESTORE(); - return; + return (NULL); } + /* XXX: check rcvif */ eh = mtod(m, struct ether_header *); } @@ -706,8 +712,7 @@ m->m_flags |= M_PROMISC; } - ether_demux(ifp, m); - CURVNET_RESTORE(); + return (ether_demux_internal(ifp, m)); } /* @@ -736,12 +741,126 @@ M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); - ether_input_internal(m->m_pkthdr.rcvif, m); + + struct ifnet *ifp = m->m_pkthdr.rcvif; + CURVNET_SET_QUIET(ifp->if_vnet); + + m = ether_input_internal(m->m_pkthdr.rcvif, m); + if (m != NULL) + ether_dispatch(m); + + CURVNET_RESTORE(); +} + +static void +netisr_wrap(int isr, struct mbuf *m, int count) +{ + struct mbuf *m_next; + + while (m != NULL) { + m_next = m->m_nextpkt; + m->m_nextpkt = NULL; + + netisr_dispatch(isr, m); + + m = m_next; + } +} + +static void +ether_dispatch_multiple(struct mbuf *m, int count) +{ + struct mbuf *ip_chain = NULL; + struct mbuf *ip6_chain = NULL; + struct mbuf *other_chain = NULL; + struct mbuf *m_next; + int ip_count = 0; + int ip6_count = 0; + int other_count = 0; + + /* mbuf chain arrives in reverse order */ + + while (m != NULL) { + m_next = m->m_nextpkt; + + struct ether_header *eh = mtod(m, struct ether_header *); 
+ u_short ether_type = ntohs(eh->ether_type); + + switch (ether_type) { +#ifdef INET + case ETHERTYPE_IP: + m_adj(m, ETHER_HDR_LEN); + m->m_nextpkt = ip_chain; + ip_chain = m; + ip_count++; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + m_adj(m, ETHER_HDR_LEN); + m->m_nextpkt = ip6_chain; + ip6_chain = m; + ip6_count++; + break; +#endif + default: + m->m_nextpkt = other_chain; + other_chain = m; + other_count++; + } + + m = m_next; + } + + if (ip_chain != NULL) + netisr_dispatch_multiple(NETISR_IP, ip_chain, ip_count); + if (ip6_chain != NULL) + netisr_wrap(NETISR_IPV6, ip6_chain, ip6_count); + if (other_chain != NULL) { + while (other_chain != NULL) { + m = other_chain; + other_chain = other_chain->m_nextpkt; + m->m_nextpkt = NULL; + ether_dispatch(m); + } + } +} + +static void +ether_nh_input_multiple(struct mbuf *m, int count) +{ + + M_ASSERTPKTHDR(m); + KASSERT(m->m_pkthdr.rcvif != NULL, + ("%s: NULL interface pointer", __func__)); + + struct ifnet *ifp = m->m_pkthdr.rcvif; + CURVNET_SET_QUIET(ifp->if_vnet); + + struct mbuf *m_next = NULL; + struct mbuf *m_chain = NULL; + count = 0; + while (m != NULL) { + m_next = m->m_nextpkt; + m->m_nextpkt = NULL; + m = ether_input_internal(m->m_pkthdr.rcvif, m); + if (m != NULL) { + m->m_nextpkt = m_chain; + m_chain = m; + count++; + } + m = m_next; + } + if (count > 0) + ether_dispatch_multiple(m_chain, count); + + CURVNET_RESTORE(); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, + .nh_handler_m = ether_nh_input_multiple, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, @@ -803,7 +922,6 @@ ether_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; - struct mbuf *mn; bool needs_epoch; needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH); @@ -816,20 +934,8 @@ CURVNET_SET_QUIET(ifp->if_vnet); if (__predict_false(needs_epoch)) NET_EPOCH_ENTER(et); - while (m) { - mn = m->m_nextpkt; - m->m_nextpkt = NULL; - - /* - * We will rely on rcvif 
being set properly in the deferred - * context, so assert it is correct here. - */ - MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); - KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " - "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); - netisr_dispatch(NETISR_ETHER, m); - m = mn; - } + /* XXX: set count to 1 instead of real number */ + netisr_dispatch_multiple(NETISR_ETHER, m, 1); if (__predict_false(needs_epoch)) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); @@ -838,12 +944,10 @@ /* * Upper layer processing for a received Ethernet packet. */ -void -ether_demux(struct ifnet *ifp, struct mbuf *m) +static struct mbuf * +ether_demux_internal(struct ifnet *ifp, struct mbuf *m) { - struct ether_header *eh; - int i, isr; - u_short ether_type; + int i; NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); @@ -852,12 +956,9 @@ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) - return; + return (NULL); } - eh = mtod(m, struct ether_header *); - ether_type = ntohs(eh->ether_type); - /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. @@ -867,14 +968,14 @@ if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); - return; + return (NULL); } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). 
*/ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); - return; + return (NULL); } /* @@ -883,7 +984,7 @@ */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); - return; + return (NULL); } /* @@ -892,6 +993,27 @@ */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); + + return (m); +} + +void +ether_demux(struct ifnet *ifp, struct mbuf *m) +{ + + m = ether_demux_internal(ifp, m); + if (__predict_true(m != NULL)) + ether_dispatch(m); +} + +static void +ether_dispatch(struct mbuf *m) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ether_header *eh = mtod(m, struct ether_header *); + u_short ether_type = ntohs(eh->ether_type); + int isr; + m_adj(m, ETHER_HDR_LEN); /* Index: sys/net/netisr.h =================================================================== --- sys/net/netisr.h +++ sys/net/netisr.h @@ -177,6 +177,7 @@ */ struct mbuf; typedef void netisr_handler_t(struct mbuf *m); +typedef void netisr_handler_m_t(struct mbuf *m, int count); typedef struct mbuf *netisr_m2cpuid_t(struct mbuf *m, uintptr_t source, u_int *cpuid); typedef struct mbuf *netisr_m2flow_t(struct mbuf *m, uintptr_t source); @@ -190,6 +191,7 @@ struct netisr_handler { const char *nh_name; /* Character string protocol name. */ netisr_handler_t *nh_handler; /* Protocol handler. */ + netisr_handler_m_t *nh_handler_m;/* Batch Protocol handler. */ netisr_m2flow_t *nh_m2flow; /* Query flow for untagged packet. */ netisr_m2cpuid_t *nh_m2cpuid; /* Query CPU to process mbuf on. */ netisr_drainedcpu_t *nh_drainedcpu; /* Callback when drained a queue. */ @@ -198,7 +200,7 @@ u_int nh_policy; /* Work placement policy. */ u_int nh_dispatch; /* Dispatch policy. */ u_int nh_ispare[4]; /* For future use. */ - void *nh_pspare[4]; /* For future use. */ + void *nh_pspare[3]; /* For future use. */ }; /* @@ -222,6 +224,7 @@ * variant. 
*/ int netisr_dispatch(u_int proto, struct mbuf *m); +int netisr_dispatch_multiple(u_int proto, struct mbuf *m, int count); int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m); int netisr_queue(u_int proto, struct mbuf *m); int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m); Index: sys/net/netisr.c =================================================================== --- sys/net/netisr.c +++ sys/net/netisr.c @@ -435,6 +435,7 @@ netisr_proto[proto].np_name = name; netisr_proto[proto].np_handler = nhp->nh_handler; + netisr_proto[proto].np_handler_m = nhp->nh_handler_m; netisr_proto[proto].np_m2flow = nhp->nh_m2flow; netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; @@ -1036,7 +1037,13 @@ nwsp = DPCPU_ID_PTR(cpuid, nws); npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); - error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); + struct mbuf *m_next; + while (m != NULL) { + m_next = m->m_nextpkt; + m->m_nextpkt = NULL; + error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); + m = m_next; + } NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); @@ -1091,6 +1098,13 @@ return (netisr_queue_src(proto, 0, m)); } +static int +netisr_queue_src_multiple(u_int proto, uintptr_t source, struct mbuf *m, int count) +{ + + return (netisr_queue_src(proto, source, m)); +} + /* * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. 
@@ -1234,6 +1248,92 @@
 	return (netisr_dispatch_src(proto, 0, m));
 }
 
+/*
+ * Dispatch a list of packets, linked through m_nextpkt, for netisr
+ * processing.  With forced direct dispatch the whole list goes to the
+ * protocol's batch handler if it registered one; otherwise packets are
+ * queued or dispatched one at a time.  "count" is the caller-supplied list
+ * length, used for the direct-dispatch statistics and the batch handler.
+ * Returns 0 or an errno value (the first one seen when dispatching per
+ * packet).
+ */
+int
+netisr_dispatch_multiple(u_int proto, struct mbuf *m, int count)
+{
+#ifdef NETISR_LOCKING
+	struct rm_priotracker tracker;
+#endif
+	struct netisr_workstream *nwsp;
+	struct netisr_proto *npp;
+	struct netisr_work *npwp;
+	struct mbuf *m_next;
+	u_int dispatch_policy;
+	int error, error1;
+
+	NET_EPOCH_ASSERT();
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("%s: invalid proto %u", __func__, proto));
+#ifdef NETISR_LOCKING
+	NETISR_RLOCK(&tracker);
+#endif
+	npp = &netisr_proto[proto];
+	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
+	    proto));
+	error = 0;
+
+#ifdef VIMAGE
+	if (V_netisr_enable[proto] == 0) {
+		/* m_freem() does not follow m_nextpkt; free each packet. */
+		while (m != NULL) {
+			m_next = m->m_nextpkt;
+			m->m_nextpkt = NULL;
+			m_freem(m);
+			m = m_next;
+		}
+		error = ENOPROTOOPT;
+		goto out_unlock;
+	}
+#endif
+	dispatch_policy = netisr_get_dispatch(npp);
+	if (dispatch_policy == NETISR_DISPATCH_DEFERRED) {
+		error = netisr_queue_src_multiple(proto, 0, m, count);
+		goto out_unlock;
+	}
+
+	/*
+	 * If direct dispatch is forced, then unconditionally dispatch
+	 * without a formal CPU selection.  Borrow the current CPU's stats,
+	 * even if there's no worker on it.  In this case we don't update
+	 * nws_flags because all netisr processing will be source ordered due
+	 * to always being forced to directly dispatch.  Protocols that did
+	 * not register a batch handler take the per-packet path below.
+	 */
+	if (dispatch_policy == NETISR_DISPATCH_DIRECT &&
+	    npp->np_handler_m != NULL) {
+		nwsp = DPCPU_PTR(nws);
+		npwp = &nwsp->nws_work[proto];
+		npwp->nw_dispatched += count;
+		npwp->nw_handled += count;
+		npp->np_handler_m(m, count);
+		goto out_unlock;
+	}
+
+	/* Hybrid dispatch, or no batch handler: one packet at a time. */
+	while (m != NULL) {
+		m_next = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		error1 = netisr_dispatch_src(proto, 0, m);
+		if (error == 0)
+			error = error1;
+		m = m_next;
+	}
+out_unlock:
+#ifdef NETISR_LOCKING
+	NETISR_RUNLOCK(&tracker);
+#endif
+	return (error);
+}
+
 #ifdef DEVICE_POLLING
 /*
  * Kernel polling borrows a netisr thread to run interface polling in; this
Index: sys/net/netisr_internal.h
===================================================================
--- sys/net/netisr_internal.h
+++ sys/net/netisr_internal.h
@@ -48,6 +48,7 @@
 
 #ifndef _KERNEL
 typedef void *netisr_handler_t;
+typedef void *netisr_handler_m_t;
 typedef void *netisr_m2flow_t;
 typedef void *netisr_m2cpuid_t;
 typedef void *netisr_drainedcpu_t;
@@ -61,6 +62,7 @@
 struct netisr_proto {
 	const char	*np_name;	/* Character string protocol name. */
 	netisr_handler_t *np_handler;	/* Protocol handler. */
+	netisr_handler_m_t *np_handler_m;/* Protocol handler. */
 	netisr_m2flow_t	*np_m2flow;	/* Query flow for untagged packet. */
 	netisr_m2cpuid_t *np_m2cpuid;	/* Query CPU to process packet on. */
 	netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue.
*/ Index: sys/netinet/in_var.h =================================================================== --- sys/netinet/in_var.h +++ sys/netinet/in_var.h @@ -475,6 +475,10 @@ void in_detachhead(struct rib_head *rh); #endif +/* Mbuf batching support */ +#define MAX_IP_BATCH_SIZE 8 +int ip_tryforward_multiple(struct mbuf **, int count); + #endif /* _KERNEL */ /* INET6 stuff */ Index: sys/netinet/ip_fastfwd.c =================================================================== --- sys/netinet/ip_fastfwd.c +++ sys/netinet/ip_fastfwd.c @@ -113,6 +113,11 @@ #define V_ipsendredirects VNET(ipsendredirects) +static enum fwd_action ip_passin(struct mbuf **pm, struct in_addr *podst); +static enum fwd_action ip_fwd(struct mbuf **pm, struct in_addr odest, + struct in_addr *pdest, struct nhop_object **pnh); +static void ip_passout(struct mbuf *m, struct in_addr dest, struct nhop_object *nh); + static struct mbuf * ip_redir_alloc(struct mbuf *m, struct nhop_object *nh, struct ip *ip, in_addr_t *addr) @@ -186,6 +191,24 @@ return (0); } +enum fwd_action { + FWD_CONTINUE, + FWD_DROPPED, + FWD_OURS, +}; + +/* + * fwd_local -> return m w/ FASTFWD_OURS + * unprocessed -> return m + */ + +struct mstate { + struct mbuf *mbuf; + struct nhop_object *nh; + struct in_addr odest; + struct in_addr dest; +}; + /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. @@ -196,19 +219,73 @@ struct mbuf * ip_tryforward(struct mbuf *m) { - struct ip *ip; - struct mbuf *m0 = NULL; - struct nhop_object *nh = NULL; - struct sockaddr_in dst; - struct in_addr dest, odest, rtdest; - uint16_t ip_len, ip_off; - int error = 0; - struct m_tag *fwd_tag = NULL; - struct mbuf *mcopy = NULL; - struct in_addr redest; - /* - * Are we active and forwarding packets? 
- */ + struct mstate state = { .mbuf = m }; + enum fwd_action action; + + action = ip_passin(&state.mbuf, &state.odest); + if (action == FWD_CONTINUE) { + action = ip_fwd(&state.mbuf, state.odest, &state.dest, &state.nh); + if (action == FWD_CONTINUE) + ip_passout(state.mbuf, state.dest, state.nh); + } + + if (action == FWD_OURS) + return (state.mbuf); + return (NULL); +} + +int +ip_tryforward_multiple(struct mbuf **mp, int count) +{ + struct mstate state[MAX_IP_BATCH_SIZE]; /* 192 bytes on stack */ + enum fwd_action action; + int xfwd = 0, xours = 0; + + //printf("s1: count %d mp[0]=%p\n", count, mp[0]); + for (int i = 0; i < count; i++) { + state[xfwd].mbuf = mp[i]; + action = ip_passin(&state[xfwd].mbuf, &state[xfwd].odest); + if (action == FWD_OURS) { + /* Save back to mp */ + mp[xours++] = state[xfwd].mbuf; + } else if (action == FWD_CONTINUE) { + /* Proceed with xwd */ + xfwd++; + } + } + //printf("s2: fwd: %d ours: %d\n", xfwd, xours); + count = xfwd; + xfwd = 0; + for (int i = 0; i < count; i++) { + //struct mbuf *m_tmp = state[i].mbuf; + action = ip_fwd(&state[i].mbuf, state[i].odest, &state[i].dest, + &state[i].nh); + //printf("m=%p ret=%d\n", m_tmp, action); + if (action == FWD_OURS) { + /* Save back to mp */ + mp[xours++] = state[i].mbuf; + } else if (action == FWD_CONTINUE) { + state[xfwd++] = state[i]; + } + } + //printf("s3: fwd: %d ours: %d\n", xfwd, xours); + for (int i = 0; i < xfwd; i++) { + ip_passout(state[i].mbuf, state[i].dest, state[i].nh); + } + + return (xours); +} + +/* + * NULL - consumed + * process further + * ip_input() + */ +static enum fwd_action +ip_passin(struct mbuf **pm, struct in_addr *podest) +{ + struct mbuf *m = *pm; + struct in_addr odest, dest; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); @@ -217,22 +294,24 @@ /* * Is packet dropped by traffic conditioner? 
*/ - if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) - goto drop; + if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) { + m_free(m); + return (FWD_DROPPED); + } #endif /* * Only IP packets without options */ - ip = mtod(m, struct ip *); + struct ip *ip = mtod(m, struct ip *); if (ip->ip_hl != (sizeof(struct ip) >> 2)) { if (V_ip_doopts == 1) - return m; + return (FWD_OURS); else if (V_ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); - return NULL; /* mbuf already free'd */ + return (FWD_DROPPED); /* mbuf already free'd */ } /* else ignore IP options and continue */ } @@ -257,13 +336,13 @@ IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || ip->ip_src.s_addr == INADDR_ANY || ip->ip_dst.s_addr == INADDR_ANY ) - return m; + return (FWD_OURS); /* * Is it for a local address on this host? */ if (in_localip(ip->ip_dst)) - return m; + return (FWD_OURS); IPSTAT_INC(ips_total); @@ -279,10 +358,17 @@ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; - if (pfil_run_hooks(V_inet_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, - NULL) != PFIL_PASS) - goto drop; + if (pfil_run_hooks(V_inet_pfil_head, pm, m->m_pkthdr.rcvif, PFIL_IN, + NULL) != PFIL_PASS) { + /* XXX: consumed? */ + if (*pm) { + m_freem(*pm); + *pm = NULL; + } + return (FWD_DROPPED); + } + m = *pm; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); @@ -296,8 +382,10 @@ /* * Is it now for a local address on this host? */ - if (in_localip(dest)) - goto forwardlocal; + if (in_localip(dest)) { + m->m_flags |= M_FASTFWD_OURS; + return (FWD_OURS); + } /* * Go on with new destination address */ @@ -307,7 +395,8 @@ /* * ipfw changed it for a local address on this host. 
*/ - goto forwardlocal; + m->m_flags |= M_FASTFWD_OURS; + return (FWD_OURS); } passin: @@ -323,7 +412,7 @@ #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); - return NULL; /* mbuf already free'd */ + return (FWD_DROPPED); /* mbuf already free'd */ } /* @@ -340,6 +429,27 @@ } #endif + *podest = odest; + + return (FWD_CONTINUE); +} + +static enum fwd_action +ip_fwd(struct mbuf **pm, struct in_addr odest, struct in_addr *pdest, + struct nhop_object **pnh) +{ + struct ip *ip; + struct in_addr dest; + struct in_addr rtdest; + struct m_tag *fwd_tag = NULL; + struct nhop_object *nh; + struct mbuf *m = *pm; + + ip = mtod(m, struct ip *); + dest = ip->ip_dst; + + //printf(" fwd %p\n", m); + /* * Next hop forced by pfil(9) hook? */ @@ -357,8 +467,9 @@ /* * Find route to destination. */ - if (ip_findroute(&nh, dest, m) != 0) - return (NULL); /* icmp unreach already sent */ + if (ip_findroute(pnh, dest, m) != 0) + return (FWD_DROPPED); /* icmp unreach already sent */ + nh = *pnh; /* * Avoid second route lookup by caching destination. @@ -369,11 +480,17 @@ * Step 5: outgoing firewall packet processing */ if (!PFIL_HOOKED_OUT(V_inet_pfil_head)) - goto passout; + return (FWD_CONTINUE); - if (pfil_run_hooks(V_inet_pfil_head, &m, nh->nh_ifp, - PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) - goto drop; + if (pfil_run_hooks(V_inet_pfil_head, pm, nh->nh_ifp, + PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) { + if (*pm != NULL) { + m_freem(*pm); + *pm = NULL; + } + return (FWD_DROPPED); + } + m = *pm; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); @@ -393,12 +510,12 @@ * Is it now for a local address on this host? */ if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { -forwardlocal: + /* * Return packet for processing by ip_input(). 
*/ m->m_flags |= M_FASTFWD_OURS; - return (m); + return (FWD_OURS); } /* * Redo route lookup with new destination address @@ -410,20 +527,36 @@ m->m_flags &= ~M_IP_NEXTHOP; } if (dest.s_addr != rtdest.s_addr && - ip_findroute(&nh, dest, m) != 0) - return (NULL); /* icmp unreach already sent */ + ip_findroute(pnh, dest, m) != 0) + return (FWD_DROPPED); /* icmp unreach already sent */ + nh = *pnh; } -passout: + *pdest = dest; + return (FWD_CONTINUE); +} + +static void +ip_passout(struct mbuf *m, struct in_addr dest, struct nhop_object *nh) +{ + struct in_addr redest; + struct mbuf *mcopy = NULL; + struct mbuf *m0 = NULL; + uint16_t ip_len, ip_off; + int error = 0; + + struct ip *ip = mtod(m, struct ip *); + /* * Step 6: send off the packet */ ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); - bzero(&dst, sizeof(dst)); - dst.sin_family = AF_INET; - dst.sin_len = sizeof(dst); + struct sockaddr_in dst = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + }; if (nh->nh_flags & NHF_GATEWAY) dst.sin_addr = nh->gw4_sa.sin_addr; else @@ -458,15 +591,20 @@ IPSTAT_INC(ips_cantfrag); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, nh->nh_mtu); - goto consumed; + if (mcopy != NULL) + m_freem(mcopy); + return; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &m, nh->nh_mtu, - nh->nh_ifp->if_hwassist) != 0) - goto drop; + nh->nh_ifp->if_hwassist) != 0) { + if (m != NULL) + m_free(m); + return; + } KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface @@ -511,13 +649,4 @@ icmp_error(mcopy, ICMP_REDIRECT, ICMP_REDIRECT_HOST, redest.s_addr, 0); mcopy = NULL; /* Freed by caller */ } - -consumed: - if (mcopy != NULL) - m_freem(mcopy); - return NULL; -drop: - if (m) - m_freem(m); - return NULL; } Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c +++ sys/netinet/ip_input.c @@ -141,9 
+141,12 @@ VNET_DEFINE(pfil_head_t, inet_pfil_head); /* Packet filter hooks */ +static void ip_input_multiple(struct mbuf *m, int count); + static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, + .nh_handler_m = ip_input_multiple, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, @@ -420,7 +423,6 @@ VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif -#ifdef RSS /* * IP direct input routine. * @@ -446,7 +448,10 @@ (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } -#endif + +static struct mbuf *ip_input_prepare(struct mbuf *m); +static void ip_ours(struct mbuf *m); +static bool ip_passin(struct mbuf *m); /* * Ip input routine. Checksum and byte swap header. If fragmented @@ -455,28 +460,128 @@ void ip_input(struct mbuf *m) { - struct rm_priotracker in_ifa_tracker; - struct ip *ip = NULL; - struct in_ifaddr *ia = NULL; - struct ifaddr *ifa; - struct ifnet *ifp; - int checkif, hlen = 0; - uint16_t sum, ip_len; - int dchg = 0; /* dest changed after fw */ - struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); if (m->m_flags & M_FASTFWD_OURS) { - m->m_flags &= ~M_FASTFWD_OURS; - /* Set up some basics that will be used later. */ - ip = mtod(m, struct ip *); - hlen = ip->ip_hl << 2; - ip_len = ntohs(ip->ip_len); goto ours; } + if ((m = ip_input_prepare(m)) == NULL) + return; + + /* -- FORWARDING -- */ + /* + * Try to forward the packet, but if we fail continue. + * ip_tryforward() does not generate redirects, so fall + * through to normal processing if redirects are required. + * ip_tryforward() does inbound and outbound packet firewall + * processing. If firewall has decided that destination becomes + * our local address, it sets M_FASTFWD_OURS flag. In this + * case skip another inbound firewall processing and update + * ip pointer. 
+	 */
+	if (V_ipforwarding != 0
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+	    && (!IPSEC_ENABLED(ipv4) ||
+	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0)
+#endif
+	    ) {
+		if ((m = ip_tryforward(m)) == NULL)
+			return;
+		if (m->m_flags & M_FASTFWD_OURS) {
+			goto ours;
+		}
+	}
+
+	/* -- END FORWARDING -- */
+
+	if (ip_passin(m))
+		goto ours;
+	return;
+ours:
+	ip_ours(m);
+}
+
+static void
+ip_input_multiple_chunked(struct mbuf **mp, int count)
+{
+	int i, nours;
+
+	KASSERT((count <= MAX_IP_BATCH_SIZE), ("batch failed, %d", count));
+	/*
+	 * As in ip_input(), only fast-forward when forwarding is enabled
+	 * and IPsec does not claim the packet.  The IPsec check is per
+	 * packet, so fall back to ip_tryforward() when IPsec is enabled.
+	 */
+	if (V_ipforwarding != 0) {
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+		if (IPSEC_ENABLED(ipv4)) {
+			struct mbuf *m;
+
+			for (i = nours = 0; i < count; i++) {
+				m = mp[i];
+				if (IPSEC_CAPS(ipv4, m,
+				    IPSEC_CAP_OPERABLE) == 0 &&
+				    (m = ip_tryforward(m)) == NULL)
+					continue;
+				mp[nours++] = m;
+			}
+			count = nours;
+		} else
+#endif
+			count = ip_tryforward_multiple(mp, count);
+	}
+
+	/*
+	 * Only packets marked M_FASTFWD_OURS may skip ip_passin(); the rest
+	 * were merely declined by the forwarder.  NOTE(review): ip_passin()
+	 * takes the mbuf by value but hands &m to pfil; if a hook replaces
+	 * the mbuf the pointer kept in mp[] is stale -- confirm.
+	 */
+	nours = 0;
+	for (i = 0; i < count; i++)
+		if ((mp[i]->m_flags & M_FASTFWD_OURS) != 0 || ip_passin(mp[i]))
+			mp[nours++] = mp[i];
+	for (i = 0; i < nours; i++)
+		ip_ours(mp[i]);
+}
+
+static void
+ip_input_multiple(struct mbuf *m, int count)
+{
+	struct mbuf *mp[MAX_IP_BATCH_SIZE];
+	struct mbuf *m_next;
+
+	NET_EPOCH_ASSERT();
+
+	count = 0;
+	while (m != NULL) {
+		m_next = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		M_ASSERTPKTHDR(m);
+		mp[count] = ip_input_prepare(m);
+		if (mp[count] != NULL) {
+			if (__predict_false(++count == MAX_IP_BATCH_SIZE)) {
+				ip_input_multiple_chunked(mp, MAX_IP_BATCH_SIZE);
+				count = 0;
+			}
+		}
+		m = m_next;
+	}
+
+	if (count > 0)
+		
ip_input_multiple_chunked(mp, count); +} + +static struct mbuf * +ip_input_prepare(struct mbuf *m) +{ + struct ifnet *ifp = NULL; + struct ip *ip; + uint16_t sum; + IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) @@ -485,7 +590,7 @@ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); - return; + return (NULL); } ip = mtod(m, struct ip *); @@ -494,7 +599,7 @@ goto bad; } - hlen = ip->ip_hl << 2; + int hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; @@ -502,7 +607,7 @@ if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); - return; + return (NULL); } ip = mtod(m, struct ip *); } @@ -536,10 +641,10 @@ #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ - return; + return (NULL); #endif - ip_len = ntohs(ip->ip_len); + int ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; @@ -564,30 +669,23 @@ m_adj(m, ip_len - m->m_pkthdr.len); } - /* - * Try to forward the packet, but if we fail continue. - * ip_tryforward() does not generate redirects, so fall - * through to normal processing if redirects are required. - * ip_tryforward() does inbound and outbound packet firewall - * processing. If firewall has decided that destination becomes - * our local address, it sets M_FASTFWD_OURS flag. In this - * case skip another inbound firewall processing and update - * ip pointer. 
- */ - if (V_ipforwarding != 0 -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - && (!IPSEC_ENABLED(ipv4) || - IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) -#endif - ) { - if ((m = ip_tryforward(m)) == NULL) - return; - if (m->m_flags & M_FASTFWD_OURS) { - m->m_flags &= ~M_FASTFWD_OURS; - ip = mtod(m, struct ip *); - goto ours; - } - } + return (m); +bad: + m_freem(m); + return (NULL); +} + +static bool +ip_passin(struct mbuf *m) +{ + struct rm_priotracker in_ifa_tracker; + struct ip *ip = mtod(m, struct ip *); + struct ifnet *ifp = m->m_pkthdr.rcvif; + int checkif; + struct in_ifaddr *ia = NULL; + struct ifaddr *ifa; + int dchg = 0; + int hlen; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* @@ -610,20 +708,19 @@ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; - odst = ip->ip_dst; + struct in_addr odst = ip->ip_dst; if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) != PFIL_PASS) - return; + return (false); if (m == NULL) /* consumed by filter */ - return; + return (false); ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { - m->m_flags &= ~M_FASTFWD_OURS; - goto ours; + return (true); } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { @@ -633,9 +730,10 @@ * to some other directly connected host. */ ip_forward(m, 1); - return; + return (false); } } + passin: /* @@ -644,8 +742,9 @@ * error was detected (causing an icmp message * to be sent and the original packet to be freed). 
*/ + hlen = ip->ip_hl << 2; if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) - return; + return (false); /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is @@ -744,7 +843,7 @@ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); - return; + return (false); } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { @@ -759,7 +858,7 @@ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); - return; + return (false); } /* @@ -792,9 +891,18 @@ } else { ip_forward(m, dchg); } - return; - + return (false); ours: + return (true); +} + +static void +ip_ours(struct mbuf *m) +{ + struct ip *ip = mtod(m, struct ip *); + int hlen = ip->ip_hl << 2; + + m->m_flags &= ~M_FASTFWD_OURS; #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only @@ -818,22 +926,7 @@ hlen = ip->ip_hl << 2; } -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - if (IPSEC_ENABLED(ipv4)) { - if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) - return; - } -#endif /* IPSEC */ - - /* - * Switch out to protocol's input routine. 
- */ - IPSTAT_INC(ips_delivered); - - (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); - return; -bad: - m_freem(m); + ip_direct_input(m); } /* Index: sys/netinet/tcp_lro.h =================================================================== --- sys/netinet/tcp_lro.h +++ sys/netinet/tcp_lro.h @@ -128,7 +128,7 @@ void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); -void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); +bool tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); void tcp_lro_reg_mbufq(void); void tcp_lro_dereg_mbufq(void); Index: sys/netinet/tcp_lro.c =================================================================== --- sys/netinet/tcp_lro.c +++ sys/netinet/tcp_lro.c @@ -1083,6 +1083,7 @@ uint64_t seq; uint64_t nseq; unsigned x; + struct mbufqc mq = {}; /* check if no mbufs to flush */ if (lc->lro_mbuf_count == 0) @@ -1113,11 +1114,13 @@ /* add packet to LRO engine */ if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { /* input packet to network layer */ - (*lc->ifp->if_input)(lc->ifp, mb); + mbufqc_enqueue(&mq, mb); lc->lro_queued++; lc->lro_flushed++; } } + if (mq.start != NULL) + (*lc->ifp->if_input)(lc->ifp, mq.start); done: /* flush active streams */ tcp_lro_rx_done(lc); @@ -1408,7 +1411,10 @@ return tcp_lro_rx2(lc, m, csum, 1); } -void +/* + * True if consumed + */ +bool tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { struct timespec arrv; @@ -1418,15 +1424,14 @@ lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); - return; + return (true); } /* check if packet is not LRO capable */ if (__predict_false(mb->m_pkthdr.csum_flags == 0 || (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { /* input packet to network layer */ - (*lc->ifp->if_input) (lc->ifp, mb); - return; + return (false); } /* Arrival Stamp the packet */ @@ -1449,6 +1454,8 @@ /* flush if array is full */ if (__predict_false(++lc->lro_mbuf_count == 
lc->lro_mbuf_max))
 		tcp_lro_flush_all(lc);
+
+	return (true);
 }
 
 /* end */
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -1587,6 +1587,35 @@
 	mq_src->mq_len = 0;
 }
 
+/*
+ * Minimal FIFO of packets chained through m_nextpkt.  An all-zero
+ * structure is an empty queue; "start" is the first packet (the one to
+ * hand to a consumer of the chain) and "end" is the last one queued.
+ */
+struct mbufqc {
+	struct mbuf	*start;
+	struct mbuf	*end;
+};
+
+/*
+ * Append packet m to the tail of qc.  Any previous m_nextpkt linkage of m
+ * is discarded.
+ */
+static inline void
+mbufqc_enqueue(struct mbufqc *qc, struct mbuf *m)
+{
+	struct mbuf *tail;
+
+	m->m_nextpkt = NULL;
+	tail = qc->end;
+	qc->end = m;
+	if (__predict_true(qc->start != NULL))
+		tail->m_nextpkt = m;
+	else
+		qc->start = m;
+}
+
+
 #ifdef _SYS_TIMESPEC_H_
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)