Index: projects/vnet/sys/net/if_epair.c =================================================================== --- projects/vnet/sys/net/if_epair.c (revision 298547) +++ projects/vnet/sys/net/if_epair.c (revision 298548) @@ -1,1010 +1,1010 @@ /*- * Copyright (c) 2008 The FreeBSD Foundation * Copyright (c) 2009-2010 Bjoern A. Zeeb * All rights reserved. * * This software was developed by CK Software GmbH under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * A pair of virtual back-to-back connected ethernet like interfaces * (``two interfaces with a virtual cross-over cable''). * * This is mostly intended to be used to provide connectivity between * different virtual network stack instances. */ /* * Things to re-think once we have more experience: * - ifp->if_reassign function once we can test with vimage. Depending on * how if_vmove() is going to be improved. * - Real random etheraddrs that are checked to be uniquish; we would need * to re-do them in case we move the interface between network stacks * in a private if_reassign function. * In case we bridge to a real interface/network or between indepedent * epairs on multiple stacks/machines, we may need this. * For now let the user handle that case. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); #ifdef EPAIR_DEBUG static int epair_debug = 0; SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW, &epair_debug, 0, "if_epair(4) debugging."); #define DPRINTF(fmt, arg...) \ if (epair_debug) \ printf("[%s:%d] " fmt, __func__, __LINE__, ##arg) #else #define DPRINTF(fmt, arg...) #endif static void epair_nh_sintr(struct mbuf *); static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *); static void epair_nh_drainedcpu(u_int); static void epair_start_locked(struct ifnet *); static int epair_media_change(struct ifnet *); static void epair_media_status(struct ifnet *, struct ifmediareq *); static int epair_clone_match(struct if_clone *, const char *); static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); static int epair_clone_destroy(struct if_clone *, struct ifnet *); static const char epairname[] = "epair"; /* Netisr related definitions and sysctl. */ static struct netisr_handler epair_nh = { .nh_name = epairname, .nh_proto = NETISR_EPAIR, .nh_policy = NETISR_POLICY_CPU, .nh_handler = epair_nh_sintr, .nh_m2cpuid = epair_nh_m2cpuid, .nh_drainedcpu = epair_nh_drainedcpu, }; static int sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&epair_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&epair_nh, qlimit)); } SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_epair_netisr_maxqlen, "I", "Maximum if_epair(4) netisr \"hw\" queue length"); struct epair_softc { struct ifnet *ifp; /* This ifp. */ struct ifnet *oifp; /* other ifp of pair. */ struct ifmedia media; /* Media config (fake). */ u_int refcount; /* # of mbufs in flight. */ u_int cpuid; /* CPU ID assigned upon creation. */ void (*if_qflush)(struct ifnet *); /* Original if_qflush routine. */ }; /* * Per-CPU list of ifps with data in the ifq that needs to be flushed * to the netisr ``hw'' queue before we allow any further direct queuing * to the ``hw'' queue. */ struct epair_ifp_drain { STAILQ_ENTRY(epair_ifp_drain) ifp_next; struct ifnet *ifp; }; STAILQ_HEAD(eid_list, epair_ifp_drain); #define EPAIR_LOCK_INIT(dpcpu) mtx_init(&(dpcpu)->if_epair_mtx, \ "if_epair", NULL, MTX_DEF) #define EPAIR_LOCK_DESTROY(dpcpu) mtx_destroy(&(dpcpu)->if_epair_mtx) #define EPAIR_LOCK_ASSERT(dpcpu) mtx_assert(&(dpcpu)->if_epair_mtx, \ MA_OWNED) #define EPAIR_LOCK(dpcpu) mtx_lock(&(dpcpu)->if_epair_mtx) #define EPAIR_UNLOCK(dpcpu) mtx_unlock(&(dpcpu)->if_epair_mtx) #ifdef INVARIANTS #define EPAIR_REFCOUNT_INIT(r, v) refcount_init((r), (v)) #define EPAIR_REFCOUNT_AQUIRE(r) refcount_acquire((r)) #define EPAIR_REFCOUNT_RELEASE(r) refcount_release((r)) #define EPAIR_REFCOUNT_ASSERT(a, p) KASSERT(a, p) #else #define EPAIR_REFCOUNT_INIT(r, v) #define EPAIR_REFCOUNT_AQUIRE(r) #define EPAIR_REFCOUNT_RELEASE(r) #define EPAIR_REFCOUNT_ASSERT(a, p) #endif static MALLOC_DEFINE(M_EPAIR, epairname, "Pair of virtual cross-over connected Ethernet-like interfaces"); static VNET_DEFINE(struct if_clone *, epair_cloner); #define V_epair_cloner VNET(epair_cloner) /* * DPCPU area and functions. */ struct epair_dpcpu { struct mtx if_epair_mtx; /* Per-CPU locking. */ int epair_drv_flags; /* Per-CPU ``hw'' drv flags. */ struct eid_list epair_ifp_drain_list; /* Per-CPU list of ifps with * data in the ifq. */ }; DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu); static void epair_dpcpu_init(void) { struct epair_dpcpu *epair_dpcpu; struct eid_list *s; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Initialize per-cpu lock. */ EPAIR_LOCK_INIT(epair_dpcpu); /* Driver flags are per-cpu as are our netisr "hw" queues. */ epair_dpcpu->epair_drv_flags = 0; /* * Initialize per-cpu drain list. * Manually do what STAILQ_HEAD_INITIALIZER would do. */ s = &epair_dpcpu->epair_ifp_drain_list; s->stqh_first = NULL; s->stqh_last = &s->stqh_first; } } static void epair_dpcpu_detach(void) { struct epair_dpcpu *epair_dpcpu; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Destroy per-cpu lock. */ EPAIR_LOCK_DESTROY(epair_dpcpu); } } /* * Helper functions. */ static u_int cpuid_from_ifp(struct ifnet *ifp) { struct epair_softc *sc; if (ifp == NULL) return (0); sc = ifp->if_softc; return (sc->cpuid); } /* * Netisr handler functions. */ static void epair_nh_sintr(struct mbuf *m) { struct ifnet *ifp; struct epair_softc *sc; ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount); } static struct mbuf * epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) { *cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif); return (m); } static void epair_nh_drainedcpu(u_int cpuid) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; struct ifnet *ifp; epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); /* * Assume our "hw" queue and possibly ifq will be emptied * again. In case we will overflow the "hw" queue while * draining, epair_start_locked will set IFF_DRV_OACTIVE * again and we will stop and return. */ STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { ifp = elm->ifp; epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; epair_start_locked(ifp); IFQ_LOCK(&ifp->if_snd); if (IFQ_IS_EMPTY(&ifp->if_snd)) { struct epair_softc *sc; STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. */ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } IFQ_UNLOCK(&ifp->if_snd); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { /* Our "hw"q overflew again. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; DPRINTF("hw queue length overflow at %u\n", epair_nh.nh_qlimit); break; } } EPAIR_UNLOCK(epair_dpcpu); } /* * Network interface (`if') related functions. */ static void epair_remove_ifp_from_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { if (ifp == elm->ifp) { struct epair_softc *sc; STAILQ_REMOVE( &epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. */ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } } EPAIR_UNLOCK(epair_dpcpu); } } static int epair_add_ifp_for_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct epair_ifp_drain *elm = NULL; sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next) if (elm->ifp == ifp) break; /* If the ifp is there already, return success. */ if (elm != NULL) return (0); elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO); if (elm == NULL) return (ENOMEM); elm->ifp = ifp; /* Add a reference for the ifp pointer on the list. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next); return (0); } static void epair_start_locked(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct mbuf *m; struct epair_softc *sc; struct ifnet *oifp; int error; DPRINTF("ifp=%p\n", ifp); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if ((ifp->if_flags & IFF_UP) == 0) return; /* * We get patckets here from ether_output via if_handoff() * and ned to put them into the input queue of the oifp * and call oifp->if_input() via netisr/epair_sintr(). */ oifp = sc->oifp; sc = oifp->if_softc; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); continue; } DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; (void) epair_add_ifp_for_draining(ifp); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } } } static void epair_start(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); epair_start_locked(ifp); EPAIR_UNLOCK(epair_dpcpu); } static int epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct ifnet *oifp; int error, len; short mflags; DPRINTF("ifp=%p m=%p\n", ifp, m); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if (m == NULL) return (0); /* * We are not going to use the interface en/dequeue mechanism * on the TX side. We are called from ether_output_frame() * and will put the packet into the incoming queue of the * other interface of our pair via the netsir. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { m_freem(m); return (ENXIO); } if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return (ENETDOWN); } BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ oifp = sc->oifp; if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (0); } len = m->m_pkthdr.len; mflags = m->m_flags; DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); #ifdef ALTQ /* Support ALTQ via the clasic if_start() path. */ IF_LOCK(&ifp->if_snd); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); if (error) if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); IF_UNLOCK(&ifp->if_snd); if (!error) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) epair_start_locked(ifp); else (void)epair_add_ifp_for_draining(ifp); } return (error); } IF_UNLOCK(&ifp->if_snd); #endif if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) { /* * Our hardware queue is full, try to fall back * queuing to the ifq but do not call ifp->if_start. * Either we are lucky or the packet is gone. */ IFQ_ENQUEUE(&ifp->if_snd, m, error); if (!error) (void)epair_add_ifp_for_draining(ifp); return (error); } sc = oifp->if_softc; /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, * but as we bypass all this we have to duplicate * the logic another time. */ if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } return (error); } static int epair_transmit(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; int error; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); error = epair_transmit_locked(ifp, m); EPAIR_UNLOCK(epair_dpcpu); return (error); } static void epair_qflush(struct ifnet *ifp) { struct epair_softc *sc; sc = ifp->if_softc; KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n", __func__, ifp, sc)); /* * Remove this ifp from all backpointer lists. The interface will not * usable for flushing anyway nor should it have anything to flush * after if_qflush(). */ epair_remove_ifp_from_draining(ifp); if (sc->if_qflush) sc->if_qflush(ifp); } static int epair_media_change(struct ifnet *ifp __unused) { /* Do nothing. */ return (0); } static void epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr) { imr->ifm_status = IFM_AVALID | IFM_ACTIVE; imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct epair_softc *sc; struct ifreq *ifr; int error; ifr = (struct ifreq *)data; switch (cmd) { case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: sc = ifp->if_softc; error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); break; case SIOCSIFMTU: /* We basically allow all kinds of MTUs. */ ifp->if_mtu = ifr->ifr_mtu; error = 0; break; default: /* Let the common ethernet handler process this. */ error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void epair_init(void *dummy __unused) { } /* * Interface cloning functions. * We use our private ones so that we can create/destroy our secondary * device along with the primary one. */ static int epair_clone_match(struct if_clone *ifc, const char *name) { const char *cp; DPRINTF("name='%s'\n", name); /* * Our base name is epair. * Our interfaces will be named epair[ab]. * So accept anything of the following list: * - epair * - epair * but not the epair[ab] versions. */ if (strncmp(epairname, name, sizeof(epairname)-1) != 0) return (0); for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct epair_softc *sca, *scb; struct ifnet *ifp; char *dp; int error, unit, wildcard; uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * We are abusing params to create our second interface. * Actually we already created it and called if_clone_create() * for it to do the official insertion procedure the moment we knew * it cannot fail anymore. So just do attach it here. */ if (params) { scb = (struct epair_softc *)params; ifp = scb->ifp; /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0b; ether_ifattach(ifp, eaddr); /* Correctly set the name for the cloner list. */ strlcpy(name, scb->ifp->if_xname, len); return (0); } /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* * If no unit had been given, we need to adjust the ifName. * Also make sure there is space for our extra [ab] suffix. */ for (dp = name; *dp != '\0'; dp++); if (wildcard) { error = snprintf(dp, len - (dp - name), "%d", unit); if (error > len - (dp - name) - 1) { /* ifName too long. */ ifc_free_unit(ifc, unit); return (ENOSPC); } dp += error; } if (len - (dp - name) - 1 < 1) { /* No space left for our [ab] suffix. */ ifc_free_unit(ifc, unit); return (ENOSPC); } *dp = 'b'; /* Must not change dp so we can replace 'a' by 'b' later. */ *(dp+1) = '\0'; /* Check if 'a' and 'b' interfaces already exist. */ if (ifunit(name) != NULL) return (EEXIST); *dp = 'a'; if (ifunit(name) != NULL) return (EEXIST); /* Allocate memory for both [ab] interfaces */ sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&sca->refcount, 1); sca->ifp = if_alloc(IFT_ETHER); if (sca->ifp == NULL) { free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&scb->refcount, 1); scb->ifp = if_alloc(IFT_ETHER); if (scb->ifp == NULL) { free(scb, M_EPAIR); if_free(sca->ifp); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } /* * Cross-reference the interfaces so we will be able to free both. */ sca->oifp = scb->ifp; scb->oifp = sca->ifp; /* * Calculate the cpuid for netisr queueing based on the * ifIndex of the interfaces. As long as we cannot configure * this or use cpuset information easily we cannot guarantee * cache locality but we can at least allow parallelism. */ sca->cpuid = netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount()); scb->cpuid = netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount()); /* Initialise pseudo media types. */ ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); /* Finish initialization of interface a. */ ifp = sca->ifp; ifp->if_softc = sca; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; ifp->if_snd.ifq_maxlen = ifqmaxlen; /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0a; ether_ifattach(ifp, eaddr); sca->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* Swap the name and finish initialization of interface b. */ *dp = 'b'; ifp = scb->ifp; ifp->if_softc = scb; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; ifp->if_snd.ifq_maxlen = ifqmaxlen; /* We need to play some tricks here for the second interface. */ strlcpy(name, epairname, len); error = if_clone_create(name, len, (caddr_t)scb); if (error) panic("%s: if_clone_create() for our 2nd iface failed: %d", __func__, error); scb->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* * Restore name to a as the ifp for this will go into the * cloner list for the initial call. */ strlcpy(name, sca->ifp->if_xname, len); DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); /* Tell the world, that we are ready to rock. */ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(sca->ifp, LINK_STATE_UP); if_link_state_change(scb->ifp, LINK_STATE_UP); return (0); } static int epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifnet *oifp; struct epair_softc *sca, *scb; int unit, error; DPRINTF("ifp=%p\n", ifp); /* * In case we called into if_clone_destroyif() ourselves * again to remove the second interface, the softc will be * NULL. In that case so not do anything but return success. */ if (ifp->if_softc == NULL) return (0); unit = ifp->if_dunit; sca = ifp->if_softc; oifp = sca->oifp; scb = oifp->if_softc; DPRINTF("ifp=%p oifp=%p\n", ifp, oifp); if_link_state_change(ifp, LINK_STATE_DOWN); if_link_state_change(oifp, LINK_STATE_DOWN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; oifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Get rid of our second half. As the other of the two * interfaces may reside in a different vnet, we need to * switch before freeing them. */ CURVNET_SET_QUIET(oifp->if_vnet); ether_ifdetach(oifp); /* * Wait for all packets to be dispatched to if_input. * The numbers can only go down as the interface is * detached so there is no need to use atomics. */ DPRINTF("scb refcnt=%u\n", scb->refcount); EPAIR_REFCOUNT_ASSERT(scb->refcount == 1, ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount)); oifp->if_softc = NULL; error = if_clone_destroyif(ifc, oifp); if (error) panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", __func__, error); if_free(oifp); ifmedia_removeall(&scb->media); free(scb, M_EPAIR); CURVNET_RESTORE(); ether_ifdetach(ifp); /* * Wait for all packets to be dispatched to if_input. */ DPRINTF("sca refcnt=%u\n", sca->refcount); EPAIR_REFCOUNT_ASSERT(sca->refcount == 1, ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount)); if_free(ifp); ifmedia_removeall(&sca->media); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (0); } static void vnet_epair_init(const void *unused __unused) { V_epair_cloner = if_clone_advanced(epairname, 0, epair_clone_match, epair_clone_create, epair_clone_destroy); + netisr_register(&epair_nh); } VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_epair_init, NULL); static void vnet_epair_uninit(const void *unused __unused) { + netisr_unregister(&epair_nh); if_clone_detach(V_epair_cloner); } VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_epair_uninit, NULL); static int epair_modevent(module_t mod, int type, void *data) { int qlimit; switch (type) { case MOD_LOAD: /* For now limit us to one global mutex and one inq. */ epair_dpcpu_init(); epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */ if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit)) epair_nh.nh_qlimit = qlimit; - netisr_register(&epair_nh); if (bootverbose) printf("%s initialized.\n", epairname); break; case MOD_UNLOAD: - netisr_unregister(&epair_nh); epair_dpcpu_detach(); if (bootverbose) printf("%s unloaded.\n", epairname); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t epair_mod = { "if_epair", epair_modevent, 0 }; -DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); MODULE_VERSION(if_epair, 1); Index: projects/vnet/sys/net/if_ethersubr.c =================================================================== --- projects/vnet/sys/net/if_ethersubr.c (revision 298547) +++ projects/vnet/sys/net/if_ethersubr.c (revision 298548) @@ -1,1198 +1,1200 @@ /*- * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); int (*bridge_output_p)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags); else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ uint32_t pflags; phdr = NULL; pflags = 0; if (ro != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; pflags = ro->ro_flags; } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags); if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { int i; if (PFIL_HOOKED(&V_link_pfil_hook)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL); if (i != 0) return (EACCES); if (m == NULL) return (0); } /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } #if defined(INET) || defined(INET6) #endif /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER); CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void -ether_init(__unused void *arg) -{ - - netisr_register(ðer_nh); -} -SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); - -static void vnet_ether_init(__unused void *arg) { int i; /* Initialize packet filter hooks. */ V_link_pfil_hook.ph_type = PFIL_TYPE_AF; V_link_pfil_hook.ph_af = AF_LINK; if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil link hook, " "error %d\n", __func__, i); + netisr_register(ðer_nh); } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); static void -vnet_ether_destroy(__unused void *arg) +vnet_ether_pfil_destroy(__unused void *arg) { int i; if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil link hook, " "error %d\n", __func__, i); } -VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, - vnet_ether_destroy, NULL); +VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, + vnet_ether_pfil_destroy, NULL); +static void +vnet_ether_destroy(__unused void *arg) +{ + netisr_unregister(ðer_nh); +} +VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_ether_destroy, NULL); static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mn; /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__)); + CURVNET_SET_QUIET(ifp->if_vnet); netisr_dispatch(NETISR_ETHER, m); + CURVNET_RESTORE(); m = mn; } } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ETHER_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = 0; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: projects/vnet/sys/net/netisr.c =================================================================== --- projects/vnet/sys/net/netisr.c (revision 298547) +++ projects/vnet/sys/net/netisr.c (revision 298548) @@ -1,1372 +1,1380 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * netisr is a packet dispatch service, allowing synchronous (directly * dispatched) and asynchronous (deferred dispatch) processing of packets by * registered protocol handlers. Callers pass a protocol identifier and * packet to netisr, along with a direct dispatch hint, and work will either * be immediately processed by the registered handler, or passed to a * software interrupt (SWI) thread for deferred dispatch. Callers will * generally select one or the other based on: * * - Whether directly dispatching a netisr handler lead to code reentrance or * lock recursion, such as entering the socket code from the socket code. * - Whether directly dispatching a netisr handler lead to recursive * processing, such as when decapsulating several wrapped layers of tunnel * information (IPSEC within IPSEC within ...). * * Maintaining ordering for protocol streams is a critical design concern. * Enforcing ordering limits the opportunity for concurrency, but maintains * the strong ordering requirements found in some protocols, such as TCP. Of * related concern is CPU affinity--it is desirable to process all data * associated with a particular stream on the same CPU over time in order to * avoid acquiring locks associated with the connection on different CPUs, * keep connection data in one cache, and to generally encourage associated * user threads to live on the same CPU as the stream. It's also desirable * to avoid lock migration and contention where locks are associated with * more than one flow. * * netisr supports several policy variations, represented by the * NETISR_POLICY_* constants, allowing protocols to play various roles in * identifying flows, assigning work to CPUs, etc. These are described in * netisr.h. */ #include "opt_ddb.h" #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define _WANT_NETISR_INTERNAL /* Enable definitions from netisr_internal.h */ #include #include #include #include #include /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to * prevent partially registered or unregistered protocols from being run. * * The following data structures and fields are protected by this lock: * * - The netisr_proto array, including all fields of struct netisr_proto. * - The nws array, including all fields of struct netisr_worker. * - The nws_array array. * * Note: the NETISR_LOCKING define controls whether read locks are acquired * in packet processing paths requiring netisr registration stability. This * is disabled by default as it can lead to measurable performance * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and * because netisr registration and unregistration is extremely rare at * runtime. If it becomes more common, this decision should be revisited. * * XXXRW: rmlocks don't support assertions. */ static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ RM_NOWITNESS) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) /* #define NETISR_LOCKING */ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- * Three global direct dispatch policies are supported: * * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of * context (may be overriden by protocols). * * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, * and we're running on the CPU the work would be performed on, then direct * dispatch it if it wouldn't violate ordering constraints on the workstream. * * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch, * always direct dispatch. (The default.) * * Notice that changing the global policy could lead to short periods of * misordered processing, but this is considered acceptable as compared to * the complexity of enforcing ordering during policy changes. Protocols can * override the global policy (when they're not doing that, they select * NETISR_DISPATCH_DEFAULT). */ #define NETISR_DISPATCH_POLICY_DEFAULT NETISR_DISPATCH_DIRECT #define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, 0, 0, sysctl_netisr_dispatch_policy, "A", "netisr dispatch policy"); /* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for * CPU 0. This must be set at boot. We will create at most one thread per CPU. * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and * therefore only 1 workstream. If set to -1, netisr would use all cpus * (mp_ncpus) and therefore would have those many workstreams. One workstream * per thread (CPU). */ static int netisr_maxthreads = 1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, * both for initial configuration and later modification using * netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); /* * The default per-workstream mbuf queue limit for protocols that don't * initialize the nh_qlimit field of their struct netisr_handler. If this is * set above netisr_maxqlimit, we truncate it to the maximum during boot. */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); /* * Store and export the compile-time constant NETISR_MAXPROT limit on the * number of protocols that can register with netisr at a time. This is * required for crashdump analysis, as it sizes netisr_proto[]. */ static u_int netisr_maxprot = NETISR_MAXPROT; SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, &netisr_maxprot, 0, "Compile-time limit on the number of protocols supported by netisr."); /* * The netisr_proto array describes all registered protocols, indexed by * protocol number. See netisr_internal.h for more details. */ -static struct netisr_proto netisr_proto[NETISR_MAXPROT]; +static VNET_DEFINE(struct netisr_proto, netisr_proto[NETISR_MAXPROT]); +#define V_netisr_proto VNET(netisr_proto) /* * Per-CPU workstream data. See netisr_internal.h for more details. */ DPCPU_DEFINE(struct netisr_workstream, nws); /* * Map contiguous values between 0 and nws_count into CPU IDs appropriate for * accessing workstreams. This allows constructions of the form * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws). */ static u_int nws_array[MAXCPU]; /* * Number of registered workstreams. Will be at most the number of running * CPUs once fully started. */ static u_int nws_count; SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields * in each stream, including per-protocol state (mbuf queues). The SWI is * woken up if asynchronous dispatch is required. */ #define NWS_LOCK(s) mtx_lock(&(s)->nws_mtx) #define NWS_LOCK_ASSERT(s) mtx_assert(&(s)->nws_mtx, MA_OWNED) #define NWS_UNLOCK(s) mtx_unlock(&(s)->nws_mtx) #define NWS_SIGNAL(s) swi_sched((s)->nws_swi_cookie, 0) /* * Utility routines for protocols that implement their own mapping of flows * to CPUs. */ u_int netisr_get_cpucount(void) { return (nws_count); } u_int netisr_get_cpuid(u_int cpunumber) { KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber, nws_count)); return (nws_array[cpunumber]); } /* * The default implementation of flow -> CPU ID mapping. * * Non-static so that protocols can use it to map their own work to specific * CPUs in a manner consistent to netisr for affinity purposes. */ u_int netisr_default_flow2cpu(u_int flowid) { return (nws_array[flowid % nws_count]); } /* * Dispatch tunable and sysctl configuration. */ struct netisr_dispatch_table_entry { u_int ndte_policy; const char *ndte_policy_str; }; static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { { NETISR_DISPATCH_DEFAULT, "default" }, { NETISR_DISPATCH_DEFERRED, "deferred" }, { NETISR_DISPATCH_HYBRID, "hybrid" }, { NETISR_DISPATCH_DIRECT, "direct" }, }; static const u_int netisr_dispatch_table_len = (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0])); static void netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, u_int buflen) { const struct netisr_dispatch_table_entry *ndtep; const char *str; u_int i; str = "unknown"; for (i = 0; i < netisr_dispatch_table_len; i++) { ndtep = &netisr_dispatch_table[i]; if (ndtep->ndte_policy == dispatch_policy) { str = ndtep->ndte_policy_str; break; } } snprintf(buffer, buflen, "%s", str); } static int netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) { const struct netisr_dispatch_table_entry *ndtep; u_int i; for (i = 0; i < netisr_dispatch_table_len; i++) { ndtep = &netisr_dispatch_table[i]; if (strcmp(ndtep->ndte_policy_str, str) == 0) { *dispatch_policyp = ndtep->ndte_policy; return (0); } } return (EINVAL); } static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) { char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; u_int dispatch_policy; int error; netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp, sizeof(tmp)); error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req); if (error == 0 && req->newptr != NULL) { error = netisr_dispatch_policy_from_str(tmp, &dispatch_policy); if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) error = EINVAL; if (error == 0) netisr_dispatch_policy = dispatch_policy; } return (error); } /* * Register a new netisr handler, which requires initializing per-protocol * fields for each workstream. All netisr work is briefly suspended while * the protocol is installed. */ void netisr_register(const struct netisr_handler *nhp) { struct netisr_work *npwp; const char *name; u_int i, proto; proto = nhp->nh_proto; name = nhp->nh_name; /* * Test that the requested registration is valid. */ KASSERT(nhp->nh_name != NULL, ("%s: nh_name NULL for %u", __func__, proto)); KASSERT(nhp->nh_handler != NULL, ("%s: nh_handler NULL for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE || nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_policy == NETISR_POLICY_CPU, ("%s: unsupported nh_policy %u for %s", __func__, nhp->nh_policy, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_m2flow == NULL, ("%s: nh_policy != FLOW but m2flow defined for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL, ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__, name)); KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, name)); KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT || nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED || nhp->nh_dispatch == NETISR_DISPATCH_HYBRID || nhp->nh_dispatch == NETISR_DISPATCH_DIRECT, ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u, %s): protocol too big", __func__, proto, name)); /* * Test that no existing registration exists for this protocol. */ NETISR_WLOCK(); - KASSERT(netisr_proto[proto].np_name == NULL, + KASSERT(V_netisr_proto[proto].np_name == NULL, ("%s(%u, %s): name present", __func__, proto, name)); - KASSERT(netisr_proto[proto].np_handler == NULL, + KASSERT(V_netisr_proto[proto].np_handler == NULL, ("%s(%u, %s): handler present", __func__, proto, name)); - netisr_proto[proto].np_name = name; - netisr_proto[proto].np_handler = nhp->nh_handler; - netisr_proto[proto].np_m2flow = nhp->nh_m2flow; - netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; - netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; + V_netisr_proto[proto].np_name = name; + V_netisr_proto[proto].np_handler = nhp->nh_handler; + V_netisr_proto[proto].np_m2flow = nhp->nh_m2flow; + V_netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; + V_netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; if (nhp->nh_qlimit == 0) - netisr_proto[proto].np_qlimit = netisr_defaultqlimit; + V_netisr_proto[proto].np_qlimit = netisr_defaultqlimit; else if (nhp->nh_qlimit > netisr_maxqlimit) { printf("%s: %s requested queue limit %u capped to " "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, netisr_maxqlimit); - netisr_proto[proto].np_qlimit = netisr_maxqlimit; + V_netisr_proto[proto].np_qlimit = netisr_maxqlimit; } else - netisr_proto[proto].np_qlimit = nhp->nh_qlimit; - netisr_proto[proto].np_policy = nhp->nh_policy; - netisr_proto[proto].np_dispatch = nhp->nh_dispatch; + V_netisr_proto[proto].np_qlimit = nhp->nh_qlimit; + V_netisr_proto[proto].np_policy = nhp->nh_policy; + V_netisr_proto[proto].np_dispatch = nhp->nh_dispatch; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; bzero(npwp, sizeof(*npwp)); - npwp->nw_qlimit = netisr_proto[proto].np_qlimit; + npwp->nw_qlimit = V_netisr_proto[proto].np_qlimit; } NETISR_WUNLOCK(); } /* * Clear drop counters across all workstreams for a protocol. */ void netisr_clearqdrops(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(netisr_proto[proto].np_handler != NULL, + KASSERT(V_netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qdrops = 0; } NETISR_WUNLOCK(); } /* * Query current drop counters across all workstreams for a protocol. */ void netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int i, proto; *qdropp = 0; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); - KASSERT(netisr_proto[proto].np_handler != NULL, + KASSERT(V_netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; *qdropp += npwp->nw_qdrops; } NETISR_RUNLOCK(&tracker); } /* * Query current per-workstream queue limit for a protocol. */ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); - KASSERT(netisr_proto[proto].np_handler != NULL, + KASSERT(V_netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - *qlimitp = netisr_proto[proto].np_qlimit; + *qlimitp = V_netisr_proto[proto].np_qlimit; NETISR_RUNLOCK(&tracker); } /* * Update the queue limit across per-workstream queues for a protocol. We * simply change the limits, and don't drain overflowed packets as they will * (hopefully) take care of themselves shortly. */ int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; if (qlimit > netisr_maxqlimit) return (EINVAL); proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(netisr_proto[proto].np_handler != NULL, + KASSERT(V_netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - netisr_proto[proto].np_qlimit = qlimit; + V_netisr_proto[proto].np_qlimit = qlimit; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qlimit = qlimit; } NETISR_WUNLOCK(); return (0); } /* * Drain all packets currently held in a particular protocol work queue. */ static void netisr_drain_proto(struct netisr_work *npwp) { struct mbuf *m; /* * We would assert the lock on the workstream but it's not passed in. */ while ((m = npwp->nw_head) != NULL) { npwp->nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (npwp->nw_head == NULL) npwp->nw_tail = NULL; npwp->nw_len--; m_freem(m); } KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); } /* * Remove the registration of a network protocol, which requires clearing * per-protocol fields across all workstreams, including freeing all mbufs in * the queues at time of unregister. All work in netisr is briefly suspended * while this takes place. */ void netisr_unregister(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(netisr_proto[proto].np_handler != NULL, + KASSERT(V_netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - netisr_proto[proto].np_name = NULL; - netisr_proto[proto].np_handler = NULL; - netisr_proto[proto].np_m2flow = NULL; - netisr_proto[proto].np_m2cpuid = NULL; - netisr_proto[proto].np_qlimit = 0; - netisr_proto[proto].np_policy = 0; + V_netisr_proto[proto].np_name = NULL; + V_netisr_proto[proto].np_handler = NULL; + V_netisr_proto[proto].np_m2flow = NULL; + V_netisr_proto[proto].np_m2cpuid = NULL; + V_netisr_proto[proto].np_qlimit = 0; + V_netisr_proto[proto].np_policy = 0; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; netisr_drain_proto(npwp); bzero(npwp, sizeof(*npwp)); } NETISR_WUNLOCK(); } /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. */ static u_int netisr_get_dispatch(struct netisr_proto *npp) { /* * Protocol-specific configuration overrides the global default. */ if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT) return (npp->np_dispatch); return (netisr_dispatch_policy); } /* * Look up the workstream given a packet and source identifier. Do this by * checking the protocol's policy, and optionally call out to the protocol * for assistance if required. */ static struct mbuf * netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, uintptr_t source, struct mbuf *m, u_int *cpuidp) { struct ifnet *ifp; u_int policy; NETISR_LOCK_ASSERT(); /* * In the event we have only one worker, shortcut and deliver to it * without further ado. */ if (nws_count == 1) { *cpuidp = nws_array[0]; return (m); } /* * What happens next depends on the policy selected by the protocol. * If we want to support per-interface policies, we should do that * here first. */ policy = npp->np_policy; if (policy == NETISR_POLICY_CPU) { m = npp->np_m2cpuid(m, source, cpuidp); if (m == NULL) return (NULL); /* * It's possible for a protocol not to have a good idea about * where to process a packet, in which case we fall back on * the netisr code to decide. In the hybrid case, return the * current CPU ID, which will force an immediate direct * dispatch. In the queued case, fall back on the SOURCE * policy. */ if (*cpuidp != NETISR_CPUID_NONE) return (m); if (dispatch_policy == NETISR_DISPATCH_HYBRID) { *cpuidp = curcpu; return (m); } policy = NETISR_POLICY_SOURCE; } if (policy == NETISR_POLICY_FLOW) { if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE && npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) return (NULL); } if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { *cpuidp = netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); } policy = NETISR_POLICY_SOURCE; } KASSERT(policy == NETISR_POLICY_SOURCE, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; else *cpuidp = nws_array[source % nws_count]; return (m); } /* * Process packets associated with a workstream and protocol. For reasons of * fairness, we process up to one complete netisr queue at a time, moving the * queue to a stack-local queue for processing, but do not loop refreshing * from the global queue. The caller is responsible for deciding whether to * loop, and for setting the NWS_RUNNING flag. The passed workstream will be * locked on entry and relocked before return, but will be released while * processing. The number of packets processed is returned. */ static u_int netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) { struct netisr_work local_npw, *npwp; u_int handled; struct mbuf *m; NETISR_LOCK_ASSERT(); NWS_LOCK_ASSERT(nwsp); KASSERT(nwsp->nws_flags & NWS_RUNNING, ("%s(%u): not running", __func__, proto)); KASSERT(proto >= 0 && proto < NETISR_MAXPROT, ("%s(%u): invalid proto\n", __func__, proto)); npwp = &nwsp->nws_work[proto]; if (npwp->nw_len == 0) return (0); /* * Move the global work queue to a thread-local work queue. * * Notice that this means the effective maximum length of the queue * is actually twice that of the maximum queue length specified in * the protocol registration call. */ handled = npwp->nw_len; local_npw = *npwp; npwp->nw_head = NULL; npwp->nw_tail = NULL; npwp->nw_len = 0; nwsp->nws_pendingbits &= ~(1 << proto); NWS_UNLOCK(nwsp); while ((m = local_npw.nw_head) != NULL) { local_npw.nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (local_npw.nw_head == NULL) local_npw.nw_tail = NULL; local_npw.nw_len--; VNET_ASSERT(m->m_pkthdr.rcvif != NULL, ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m)); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); - netisr_proto[proto].np_handler(m); + V_netisr_proto[proto].np_handler(m); CURVNET_RESTORE(); } KASSERT(local_npw.nw_len == 0, ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); - if (netisr_proto[proto].np_drainedcpu) - netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); + /* We can just use the one from the default VNET. */ + CURVNET_SET_QUIET(vnet0); + if (V_netisr_proto[proto].np_drainedcpu) + V_netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); + CURVNET_RESTORE(); NWS_LOCK(nwsp); npwp->nw_handled += handled; return (handled); } /* * SWI handler for netisr -- processes packets in a set of workstreams that * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already * being direct dispatched, go back to sleep and wait for the dispatching * thread to wake us up again. */ static void swi_net(void *arg) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; u_int bits, prot; nwsp = arg; #ifdef DEVICE_POLLING KASSERT(nws_count == 1, ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) goto out; nwsp->nws_flags |= NWS_RUNNING; nwsp->nws_flags &= ~NWS_SCHEDULED; while ((bits = nwsp->nws_pendingbits) != 0) { while ((prot = ffs(bits)) != 0) { prot--; bits &= ~(1 << prot); (void)netisr_process_workstream_proto(nwsp, prot); } } nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif } static int netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, struct netisr_work *npwp, struct mbuf *m, int *dosignalp) { NWS_LOCK_ASSERT(nwsp); *dosignalp = 0; if (npwp->nw_len < npwp->nw_qlimit) { m->m_nextpkt = NULL; if (npwp->nw_head == NULL) { npwp->nw_head = m; npwp->nw_tail = m; } else { npwp->nw_tail->m_nextpkt = m; npwp->nw_tail = m; } npwp->nw_len++; if (npwp->nw_len > npwp->nw_watermark) npwp->nw_watermark = npwp->nw_len; /* * We must set the bit regardless of NWS_RUNNING, so that * swi_net() keeps calling netisr_process_workstream_proto(). */ nwsp->nws_pendingbits |= (1 << proto); if (!(nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { nwsp->nws_flags |= NWS_SCHEDULED; *dosignalp = 1; /* Defer until unlocked. */ } npwp->nw_queued++; return (0); } else { m_freem(m); npwp->nw_qdrops++; return (ENOBUFS); } } static int netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) { struct netisr_workstream *nwsp; struct netisr_work *npwp; int dosignal, error; #ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); #endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); dosignal = 0; error = 0; nwsp = DPCPU_ID_PTR(cpuid, nws); npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); return (error); } int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif u_int cpuid; int error; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif - KASSERT(netisr_proto[proto].np_handler != NULL, - ("%s: invalid proto %u", __func__, proto)); + if (V_netisr_proto[proto].np_handler == NULL) { + m_freem(m); + return (ENOPROTOOPT); + } - m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, + m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); error = netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_queue(u_int proto, struct mbuf *m) { return (netisr_queue_src(proto, 0, m)); } /* * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. */ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *npwp; int dosignal, error; u_int cpuid, dispatch_policy; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif - npp = &netisr_proto[proto]; - KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, - proto)); + npp = &V_netisr_proto[proto]; + if (npp->np_handler == NULL) { + m_freem(m); + return (ENOPROTOOPT); + } dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); /* * If direct dispatch is forced, then unconditionally dispatch * without a formal CPU selection. Borrow the current CPU's stats, * even if there's no worker on it. In this case we don't update * nws_flags because all netisr processing will be source ordered due * to always being forced to directly dispatch. */ if (dispatch_policy == NETISR_DISPATCH_DIRECT) { nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; npwp->nw_handled++; - netisr_proto[proto].np_handler(m); + V_netisr_proto[proto].np_handler(m); error = 0; goto out_unlock; } KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID, ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy)); /* * Otherwise, we execute in a hybrid mode where we will try to direct * dispatch if we're on the right CPU and the netisr worker isn't * already running. */ sched_pin(); - m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID, + m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_HYBRID, source, m, &cpuid); if (m == NULL) { error = ENOBUFS; goto out_unpin; } KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); if (cpuid != curcpu) goto queue_fallback; nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; /*- * We are willing to direct dispatch only if three conditions hold: * * (1) The netisr worker isn't already running, * (2) Another thread isn't already directly dispatching, and * (3) The netisr hasn't already been woken up. */ NWS_LOCK(nwsp); if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); goto out_unpin; } /* * The current thread is now effectively the netisr worker, so set * the dispatching flag to prevent concurrent processing of the * stream from another thread (even the netisr worker), which could * otherwise lead to effective misordering of the stream. */ nwsp->nws_flags |= NWS_DISPATCHING; NWS_UNLOCK(nwsp); - netisr_proto[proto].np_handler(m); + V_netisr_proto[proto].np_handler(m); NWS_LOCK(nwsp); nwsp->nws_flags &= ~NWS_DISPATCHING; npwp->nw_handled++; npwp->nw_hybrid_dispatched++; /* * If other work was enqueued by another thread while we were direct * dispatching, we need to signal the netisr worker to do that work. * In the future, we might want to do some of that work in the * current thread, rather than trigger further context switches. If * so, we'll want to establish a reasonable bound on the work done in * the "borrowed" context. */ if (nwsp->nws_pendingbits != 0) { nwsp->nws_flags |= NWS_SCHEDULED; dosignal = 1; } else dosignal = 0; NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); error = 0; goto out_unpin; queue_fallback: error = netisr_queue_internal(proto, m, cpuid); out_unpin: sched_unpin(); out_unlock: #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_dispatch(u_int proto, struct mbuf *m) { return (netisr_dispatch_src(proto, 0, m)); } #ifdef DEVICE_POLLING /* * Kernel polling borrows a netisr thread to run interface polling in; this * function allows kernel polling to request that the netisr thread be * scheduled even if no packets are pending for protocols. */ void netisr_sched_poll(void) { struct netisr_workstream *nwsp; nwsp = DPCPU_ID_PTR(nws_array[0], nws); NWS_SIGNAL(nwsp); } #endif static void netisr_start_swi(u_int cpuid, struct pcpu *pc) { char swiname[12]; struct netisr_workstream *nwsp; int error; KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); nwsp = DPCPU_ID_PTR(cpuid, nws); mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); nwsp->nws_cpu = cpuid; snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); if (error) panic("%s: swi_add %d", __func__, error); pc->pc_netisr = nwsp->nws_intr_event; if (netisr_bindthreads) { error = intr_event_bind(nwsp->nws_intr_event, cpuid); if (error != 0) printf("%s: cpu %u: intr_event_bind: %d", __func__, cpuid, error); } NETISR_WLOCK(); nws_array[nws_count] = nwsp->nws_cpu; nws_count++; NETISR_WUNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. * * Start a worker thread for the boot CPU so that we can support network * traffic immediately in case the network stack is used before additional * CPUs are started (for example, diskless boot). */ static void netisr_init(void *arg) { KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); NETISR_LOCK_INIT(); if (netisr_maxthreads == 0 || netisr_maxthreads < -1 ) netisr_maxthreads = 1; /* default behavior */ else if (netisr_maxthreads == -1) netisr_maxthreads = mp_ncpus; /* use max cpus */ if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with * multiple netisr threads, so for the time being compiling in device * polling disables parallel netisr workers. */ if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { printf("netisr_init: forcing maxthreads to 1 and " "bindthreads to 0 for device polling\n"); netisr_maxthreads = 1; netisr_bindthreads = 0; } #endif netisr_start_swi(curcpu, pcpu_find(curcpu)); } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); /* * Start worker threads for additional CPUs. No attempt to gracefully handle * work reassignment, we don't yet support dynamic reconfiguration. */ static void netisr_start(void *arg) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; /* Worker will already be present for boot CPU. */ if (pc->pc_netisr != NULL) continue; netisr_start_swi(pc->pc_cpuid, pc); } } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); /* * Sysctl monitoring for netisr: query a list of registered protocols. */ static int sysctl_netisr_proto(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_proto *snpp, *snp_array; struct netisr_proto *npp; u_int counter, proto; int error; if (req->newptr != NULL) return (EINVAL); snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); for (proto = 0; proto < NETISR_MAXPROT; proto++) { - npp = &netisr_proto[proto]; + npp = &V_netisr_proto[proto]; if (npp->np_name == NULL) continue; snpp = &snp_array[counter]; snpp->snp_version = sizeof(*snpp); strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN); snpp->snp_proto = proto; snpp->snp_qlimit = npp->np_qlimit; snpp->snp_policy = npp->np_policy; snpp->snp_dispatch = npp->np_dispatch; if (npp->np_m2flow != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; if (npp->np_m2cpuid != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID; if (npp->np_drainedcpu != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU; counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= NETISR_MAXPROT, ("sysctl_netisr_proto: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter); free(snp_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, proto, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto, "S,sysctl_netisr_proto", "Return list of protocols registered with netisr"); /* * Sysctl monitoring for netisr: query a list of workstreams. */ static int sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_workstream *snwsp, *snws_array; struct netisr_workstream *nwsp; u_int counter, cpuid; int error; if (req->newptr != NULL) return (EINVAL); snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); snwsp = &snws_array[counter]; snwsp->snws_version = sizeof(*snwsp); /* * For now, we equate workstream IDs and CPU IDs in the * kernel, but expose them independently to userspace in case * that assumption changes in the future. */ snwsp->snws_wsid = cpuid; snwsp->snws_cpu = cpuid; if (nwsp->nws_intr_event != NULL) snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR; NWS_UNLOCK(nwsp); counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= MAXCPU, ("sysctl_netisr_workstream: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter); free(snws_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, workstream, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream, "S,sysctl_netisr_workstream", "Return list of workstreams implemented by netisr"); /* * Sysctl monitoring for netisr: query per-protocol data across all * workstreams. */ static int sysctl_netisr_work(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_work *snwp, *snw_array; struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *nwp; u_int counter, cpuid, proto; int error; if (req->newptr != NULL) return (EINVAL); snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); for (proto = 0; proto < NETISR_MAXPROT; proto++) { - npp = &netisr_proto[proto]; + npp = &V_netisr_proto[proto]; if (npp->np_name == NULL) continue; nwp = &nwsp->nws_work[proto]; snwp = &snw_array[counter]; snwp->snw_version = sizeof(*snwp); snwp->snw_wsid = cpuid; /* See comment above. */ snwp->snw_proto = proto; snwp->snw_len = nwp->nw_len; snwp->snw_watermark = nwp->nw_watermark; snwp->snw_dispatched = nwp->nw_dispatched; snwp->snw_hybrid_dispatched = nwp->nw_hybrid_dispatched; snwp->snw_qdrops = nwp->nw_qdrops; snwp->snw_queued = nwp->nw_queued; snwp->snw_handled = nwp->nw_handled; counter++; } NWS_UNLOCK(nwsp); } KASSERT(counter <= MAXCPU * NETISR_MAXPROT, ("sysctl_netisr_work: counter too big (%d)", counter)); NETISR_RUNLOCK(&tracker); error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter); free(snw_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, work, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work, "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { struct netisr_workstream *nwsp; struct netisr_work *nwp; int first, proto; u_int cpuid; db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; first = 1; for (proto = 0; proto < NETISR_MAXPROT; proto++) { - if (netisr_proto[proto].np_handler == NULL) + if (V_netisr_proto[proto].np_handler == NULL) continue; nwp = &nwsp->nws_work[proto]; if (first) { db_printf("%3d ", cpuid); first = 0; } else db_printf("%3s ", ""); db_printf( "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", - netisr_proto[proto].np_name, nwp->nw_len, + V_netisr_proto[proto].np_name, nwp->nw_len, nwp->nw_watermark, nwp->nw_qlimit, nwp->nw_dispatched, nwp->nw_hybrid_dispatched, nwp->nw_qdrops, nwp->nw_queued); } } } #endif Index: projects/vnet/sys/net/rtsock.c =================================================================== --- projects/vnet/sys/net/rtsock.c (revision 298547) +++ projects/vnet/sys/net/rtsock.c (revision 298548) @@ -1,1925 +1,1942 @@ /*- * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void rts_init(void) { int tmp; if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; - netisr_register(&rtsock_nh); } SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); + +static void +vnet_rts_init(void) +{ + + netisr_register(&rtsock_nh); +} +VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_init, 0); + +static void +vnet_rts_uninit(void) +{ + + netisr_unregister(&rtsock_nh); +} +VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_uninit, 0); static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rnh; struct rt_addrinfo info; struct sockaddr_storage ss; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info.rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide logest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RIB_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; int len, buflen = 0, dlen; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) len = sizeof(struct ifa_msghdrl32); else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else buflen -= dlen; } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occured, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &info, w, len); else error = sysctl_iflist_ifm(ifp, &info, w, len); if (error) goto done; } while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; int len, error = 0; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) { IF_ADDR_RUNLOCK(ifp); goto done; } } } IF_ADDR_RUNLOCK(ifp); } done: IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { RIB_RLOCK(rnh); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] }; VNET_DOMAIN_SET(route); Index: projects/vnet/sys/netinet/if_ether.c =================================================================== --- projects/vnet/sys/netinet/if_ether.c (revision 298547) +++ projects/vnet/sys/netinet/if_ether.c (revision 298548) @@ -1,1338 +1,1353 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #define SIN(s) ((const struct sockaddr_in *)(s)) static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; /* Simple ARP state machine */ enum arp_llinfo_state { ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */ ARP_LLINFO_REACHABLE, /* LLE is valid */ ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */ ARP_LLINFO_DELETED, /* LLE is deleted */ }; SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ static VNET_DEFINE(int, arp_maxtries) = 5; static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ static VNET_DEFINE(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arpt_rexmit VNET(arpt_rexmit) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); #define ARP_LOG(pri, ...) do { \ if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) -static void arp_init(void); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la); static void arp_mark_lle_reachable(struct llentry *la); static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); static eventhandler_tag iflladdr_tag; static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; int r_skip_req; if (lle->la_flags & LLE_STATIC) { return; } LLE_WLOCK(lle); if (callout_pending(&lle->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by arpresolve() below. */ LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); switch (lle->ln_state) { case ARP_LLINFO_REACHABLE: /* * Expiration time is approaching. * Let's try to refresh entry if it is still * in use. * * Set r_skip_req to get feedback from * fast path. Change state and re-schedule * ourselves. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); lle->ln_state = ARP_LLINFO_VERIFY; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; case ARP_LLINFO_VERIFY: LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; LLE_REQ_UNLOCK(lle); if (r_skip_req == 0 && lle->la_preempt > 0) { /* Entry was used, issue refresh request */ struct in_addr dst; dst = lle->r_l3addr.addr4; lle->la_preempt--; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); arprequest(ifp, NULL, &dst, NULL); CURVNET_RESTORE(); return; } /* Nothing happened. Reschedule if not too late */ if (lle->la_expire > time_uptime) { callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; } break; case ARP_LLINFO_INCOMPLETE: case ARP_LLINFO_DELETED: break; } if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->lle_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { LLE_REMREF(lle); lltable_unlink_entry(lle->lle_tbl, lle); } IF_AFDATA_UNLOCK(ifp); size_t pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Stores link-layer header for @ifp in format suitable for if_output() * into buffer @buf. Resulting header length is stored in @bufsize. * * Returns 0 on success. */ static int arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, size_t *bufsize) { struct if_encap_req ereq; int error; bzero(buf, *bufsize); bzero(&ereq, sizeof(ereq)); ereq.buf = buf; ereq.bufsize = *bufsize; ereq.rtype = IFENCAP_LL; ereq.family = AF_ARP; ereq.lladdr = ar_tha(ah); ereq.hdata = (u_char *)ah; if (bcast) ereq.flags = IFENCAP_FLAG_BROADCAST; error = ifp->if_requestencap(ifp, &ereq); if (error == 0) *bufsize = ereq.bufsize; return (error); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; struct route ro; int error; if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } IF_ADDR_RUNLOCK(ifp); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); } /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). * We have already checked than we can't use existing lle without * modification so we have to acquire LLE_EXCLUSIVE lle lock. * * On success, desten and flags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ static int arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; char *lladdr; int ll_len; if (pflags != NULL) *pflags = 0; if ((flags & LLE_CREATE) == 0) { IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); } if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", inet_ntoa(SIN(dst)->sin_addr), if_name(ifp)); m_freem(m); return (EINVAL); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* Prefer ANY existing lle over newly-created one */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; } } if (la == NULL) { m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { if (flags & LLE_ADDRONLY) { lladdr = la->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = la->r_linkdata; ll_len = la->r_hdrlen; } bcopy(lladdr, desten, ll_len); /* Check if we have feedback request from arptimer() */ if (la->r_skip_req != 0) { LLE_REQ_LOCK(la); la->r_skip_req = 0; /* Notify that entry was used */ LLE_REQ_UNLOCK(la); } if (pflags != NULL) *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR); LLE_WUNLOCK(la); return (0); } renew = (la->la_asked == 0 || la->la_expire != time_uptime); /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->lle_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } LLE_WUNLOCK(la); return (error); } /* * Resolve an IP address into an ethernet address. */ int arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags); return (error); } /* * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is the storage to put LL header. * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR * * On success, full/partial link header and flags are filled in and * the function returns 0. * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { struct llentry *la = 0; if (pflags != NULL) *pflags = 0; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(la->r_linkdata, desten, la->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); /* Check if we have feedback request from arptimer() */ if (la->r_skip_req != 0) { LLE_REQ_LOCK(la); la->r_skip_req = 0; /* Notify that entry was used */ LLE_REQ_UNLOCK(la); } IF_AFDATA_RUNLOCK(ifp); return (0); } IF_AFDATA_RUNLOCK(ifp); return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, desten, pflags)); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; struct ifnet *ifp; char *layer; int hlen; ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); /* Check if length is sufficient */ if (m->m_len < arphdr_len(ar)) { m = m_pullup(m, arphdr_len(ar)); if (m == NULL) { ARP_LOG(LOG_NOTICE, "short packet received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); } hlen = 0; layer = ""; switch (ntohs(ar->ar_hrd)) { case ARPHRD_ETHER: hlen = ETHER_ADDR_LEN; /* RFC 826 */ layer = "ethernet"; break; case ARPHRD_IEEE802: hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */ layer = "fddi"; break; case ARPHRD_ARCNET: hlen = 1; /* RFC 1201, ARC_ADDR_LEN */ layer = "arcnet"; break; case ARPHRD_INFINIBAND: hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ layer = "infiniband"; break; case ARPHRD_IEEE1394: hlen = 0; /* SHALL be 16 */ /* RFC 2734 */ layer = "firewire"; /* * Restrict too long hardware addresses. * Currently we are capable of handling 20-byte * addresses ( sizeof(lle->ll_addr) ) */ if (ar->ar_hln >= 20) hlen = 16; break; default: ARP_LOG(LOG_NOTICE, "packet with unknown hardware format 0x%02d received on " "%s\n", ntohs(ar->ar_hrd), if_name(ifp)); m_freem(m); return; } if (hlen != 0 && hlen != ar->ar_hln) { ARP_LOG(LOG_NOTICE, "packet with invalid %s address length %d received on %s\n", layer, ar->ar_hln, if_name(ifp)); m_freem(m); return; } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); static void in_arpinput(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL, *la_tmp; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; struct sockaddr *dst; struct nhop4_basic nh4; uint8_t linkhdr[LLE_MAX_LINKHDR]; struct route ro; size_t linkhdrsize; int lladdr_off; int error; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * We already have checked that mbuf contains enough contiguous data * to hold entire arp message according to the arp header. */ ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); /* * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " "%s!\n", inet_ntoa(isaddr)); goto drop; } if (ifp->if_addrlen != ah->ar_hln) { ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; dst = (struct sockaddr *)&sin; IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { /* * Request/reply to our address, but no lle exists yet. * Calculate full link prepend to use in lle. */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) goto reply; /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { /* * lle creation may fail if source address belongs * to non-directly connected subnet. However, we * will try to answer the request instead of dropping * frame. */ goto reply; } lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* * Check if lle still does not exists. * If it does, that means that we either * 1) have configured it explicitly, via * 1a) 'arp -s' static entry or * 1b) interface address static record * or * 2) it was the result of sending first packet to-host * or * 3) it was another arp reply packet we handled in * different thread. * * In all cases except 3) we definitely need to prefer * existing lle. For the sake of simplicity, prefer any * existing lle over newly-create one. */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp == NULL) { arp_mark_lle_reachable(la); LLE_WUNLOCK(la); } else { /* Free newly-create entry and handle packet */ lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; la_tmp = NULL; arp_check_update_lle(ah, isaddr, ifp, bridged, la); /* arp_check_update_lle() returns @la unlocked */ } la = NULL; } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { struct llentry *lle = NULL; sin.sin_addr = itaddr; IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_RUNLOCK(ifp); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; /* XXX MRT use table 0 for arp reply */ if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (nh4.nh_ifp == ifp) goto drop; (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ /* XXX MRT use table 0 for arp checks */ if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0) goto drop; if (nh4.nh_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s\n", inet_ntoa(isaddr), ifp->if_xname); goto drop; } #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa(itaddr)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); /* * arp_fillheader() may fail due to lack of support inside encap request * routing. This is not necessary an error, AF_ARP can/should be handled * by if_output(). */ if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif /* * Checks received arp data against existing @la. * Updates lle state/performs notification if necessary. */ static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la) { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; LLE_WLOCK_ASSERT(la); /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", inet_ntoa(isaddr), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); return; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); return; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, (u_char *)&la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } /* Calculate full link prepend to use in lle */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) return; /* Check if something has changed */ if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { /* Try to perform LLE update */ if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off) == 0) return; /* Clear fast path feedback request if set */ la->r_skip_req = 0; } arp_mark_lle_reachable(la); /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. */ if (la->la_hold != NULL) { m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; lltable_fill_sa_entry(la, &sa); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } static void arp_mark_lle_reachable(struct llentry *la) { int canceled, wtime; LLE_WLOCK_ASSERT(la); la->ln_state = ARP_LLINFO_REACHABLE; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit; if (wtime < 0) wtime = V_arpt_keep; canceled = callout_reset(&la->lle_timer, hz * wtime, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; } /* * Add pernament link-layer record for given interface address. */ static __noinline void arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst) { struct llentry *lle, *lle_tmp; /* * Interface address LLE record is considered static * because kernel code relies on LLE_STATIC flag to check * if these entries can be rewriten by arp updates. */ lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst); if (lle == NULL) { log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); return; } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Unlink any entry if exists */ lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) lltable_unlink_entry(LLTABLE(ifp), lle_tmp); lltable_link_entry(LLTABLE(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); if (lle_tmp != NULL) lltable_free_entry(LLTABLE(ifp), lle_tmp); } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { const struct sockaddr_in *dst_in; const struct sockaddr *dst; if (ifa->ifa_carp != NULL) return; dst = ifa->ifa_addr; dst_in = (const struct sockaddr_in *)dst; if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY) return; arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); arp_add_ifa_lle(ifp, dst); } void arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr) { if (ntohl(addr.s_addr) != INADDR_ANY) arprequest(ifp, &addr, &addr, enaddr); } /* * Sends gratuitous ARPs for each ifaddr to notify other * nodes about the address change. */ static __noinline void arp_handle_ifllchange(struct ifnet *ifp) { struct ifaddr *ifa; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } } /* * A handler for interface link layer address change event. */ static void arp_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE(ifp)); if ((ifp->if_flags & IFF_UP) != 0) arp_handle_ifllchange(ifp); } static void -arp_init(void) +vnet_arp_init(void) { netisr_register(&arp_nh); if (IS_DEFAULT_VNET(curvnet)) iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } -SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, arp_init, 0); +VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, + vnet_arp_init, 0); + +#ifdef VIMAGE +/* + * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH + * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5. + */ +static void +vnet_arp_destroy(__unused void *arg) +{ + + netisr_unregister(&arp_nh); +} +VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_arp_destroy, NULL); +#endif Index: projects/vnet/sys/netinet/igmp.c =================================================================== --- projects/vnet/sys/netinet/igmp.c (revision 298547) +++ projects/vnet/sys/netinet/igmp.c (revision 298548) @@ -1,3667 +1,3676 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * inet_ntoa_haddr(in_addr_t haddr) { struct in_addr ia; ia.s_addr = htonl(haddr); return (inet_ntoa(ia)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. */ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. */ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a recieved IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. */ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. */ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif +static void +vnet_igmp_init(const void *unused __unused) +{ + + netisr_register(&igmp_nh); +} +VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, + vnet_igmp_init, NULL); + #ifdef VIMAGE static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. */ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + + netisr_unregister(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* SLIST_HEAD(,in_multi) igi_relinmhead */ /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); - netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; -DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); +DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: projects/vnet/sys/netinet/ip_input.c =================================================================== --- projects/vnet/sys/netinet/ip_input.c (revision 298547) +++ projects/vnet/sys/netinet/ip_input.c (revision 298548) @@ -1,1357 +1,1363 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) - return; + goto out; pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } +out: netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { struct ifnet *ifp; int error; + +#ifdef RSS + netisr_unregister(&ip_direct_nh); +#endif + netisr_unregister(&ip_nh); if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* Try to forward the packet, but if we fail continue */ #ifdef IPSEC /* For now we do not handle IPSEC in tryforward. */ if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) && (V_ipforwarding == 1)) if (ip_tryforward(m) == NULL) return; /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; #else if (V_ipforwarding == 1) if (ip_tryforward(m) == NULL) return; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); if (dchg != 0) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m, ip->ip_p) != 0) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #ifdef IPSEC if (ip_ipsec_fwd(m) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #endif /* IPSEC */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); } else ia = NULL; #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. * In case of IPsec, Don't discard it just yet, but pass it to * ip_output in case of outgoing IPsec policy. */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); RO_RTFREE(&ro); return; } #endif /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: projects/vnet/sys/netinet6/ip6_input.c =================================================================== --- projects/vnet/sys/netinet6/ip6_input.c (revision 298547) +++ projects/vnet/sys/netinet6/ip6_input.c (revision 298548) @@ -1,1729 +1,1735 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); TAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) - return; + goto out; pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } +out: netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { struct ifaddr *ifa, *nifa; struct ifnet *ifp; int error; + +#ifdef RSS + netisr_unregister(&ip6_direct_nh); +#endif + netisr_unregister(&ip6_nh); if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } /* Cleanup addresses. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_LOCK(ifp); */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* IF_ADDR_UNLOCK(ifp); */ in6_ifdetach(ifp, 0); mld_domifdetach(ifp); /* Make sure any routes are gone as well. */ rt_flushifroutes_af(ifp, AF_INET6); } IFNET_RUNLOCK(); nd6_destroy(); in6_ifattach_destroy(); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ m->m_flags &= ~M_FASTFWD_OURS; ours = 1; ip6 = mtod(m, struct ip6_hdr *); goto hbhcheck; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC( ip6s_m2m[m->m_pkthdr.rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip6_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } if ((m->m_flags & M_IP6_NEXTHOP) && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. */ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(in6p->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (in6p->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). */ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals to OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop `neater' mechanism to process extension headers. */ char * ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return (&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return (&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT };