Index: projects/vnet/sys/net/if_epair.c
===================================================================
--- projects/vnet/sys/net/if_epair.c	(revision 298547)
+++ projects/vnet/sys/net/if_epair.c	(revision 298548)
@@ -1,1010 +1,1010 @@
 /*-
  * Copyright (c) 2008 The FreeBSD Foundation
  * Copyright (c) 2009-2010 Bjoern A. Zeeb <bz@FreeBSD.org>
  * All rights reserved.
  *
  * This software was developed by CK Software GmbH under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * A pair of virtual back-to-back connected ethernet like interfaces
  * (``two interfaces with a virtual cross-over cable'').
  *
  * This is mostly intended to be used to provide connectivity between
  * different virtual network stack instances.
  */
 /*
  * Things to re-think once we have more experience:
  * - ifp->if_reassign function once we can test with vimage. Depending on
  *   how if_vmove() is going to be improved.
  * - Real random etheraddrs that are checked to be uniquish; we would need
  *   to re-do them in case we move the interface between network stacks
  *   in a private if_reassign function.
  *   In case we bridge to a real interface/network or between indepedent
  *   epairs on multiple stacks/machines, we may need this.
  *   For now let the user handle that case.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/types.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl");
 
 #ifdef EPAIR_DEBUG
 static int epair_debug = 0;
 SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW,
     &epair_debug, 0, "if_epair(4) debugging.");
 #define	DPRINTF(fmt, arg...)						\
 	if (epair_debug)						\
 		printf("[%s:%d] " fmt, __func__, __LINE__, ##arg)
 #else
 #define	DPRINTF(fmt, arg...)
 #endif
 
 static void epair_nh_sintr(struct mbuf *);
 static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *);
 static void epair_nh_drainedcpu(u_int);
 
 static void epair_start_locked(struct ifnet *);
 static int epair_media_change(struct ifnet *);
 static void epair_media_status(struct ifnet *, struct ifmediareq *);
 
 static int epair_clone_match(struct if_clone *, const char *);
 static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int epair_clone_destroy(struct if_clone *, struct ifnet *);
 
 static const char epairname[] = "epair";
 
 /* Netisr related definitions and sysctl. */
 static struct netisr_handler epair_nh = {
 	.nh_name	= epairname,
 	.nh_proto	= NETISR_EPAIR,
 	.nh_policy	= NETISR_POLICY_CPU,
 	.nh_handler	= epair_nh_sintr,
 	.nh_m2cpuid	= epair_nh_m2cpuid,
 	.nh_drainedcpu	= epair_nh_drainedcpu,
 };
 
 static int
 sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&epair_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&epair_nh, qlimit));
 }
 SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_epair_netisr_maxqlen, "I",
     "Maximum if_epair(4) netisr \"hw\" queue length");
 
 struct epair_softc {
 	struct ifnet	*ifp;		/* This ifp. */
 	struct ifnet	*oifp;		/* other ifp of pair. */
 	struct ifmedia	media;		/* Media config (fake). */
 	u_int		refcount;	/* # of mbufs in flight. */
 	u_int		cpuid;		/* CPU ID assigned upon creation. */
 	void		(*if_qflush)(struct ifnet *);
 					/* Original if_qflush routine. */
 };
 
 /*
  * Per-CPU list of ifps with data in the ifq that needs to be flushed
  * to the netisr ``hw'' queue before we allow any further direct queuing
  * to the ``hw'' queue.
  */
 struct epair_ifp_drain {
 	STAILQ_ENTRY(epair_ifp_drain)	ifp_next;
 	struct ifnet			*ifp;
 };
 STAILQ_HEAD(eid_list, epair_ifp_drain);
 
 #define	EPAIR_LOCK_INIT(dpcpu)		mtx_init(&(dpcpu)->if_epair_mtx, \
 					    "if_epair", NULL, MTX_DEF)
 #define	EPAIR_LOCK_DESTROY(dpcpu)	mtx_destroy(&(dpcpu)->if_epair_mtx)
 #define	EPAIR_LOCK_ASSERT(dpcpu)	mtx_assert(&(dpcpu)->if_epair_mtx, \
 					    MA_OWNED)
 #define	EPAIR_LOCK(dpcpu)		mtx_lock(&(dpcpu)->if_epair_mtx)
 #define	EPAIR_UNLOCK(dpcpu)		mtx_unlock(&(dpcpu)->if_epair_mtx)
 
 #ifdef INVARIANTS
 #define	EPAIR_REFCOUNT_INIT(r, v)	refcount_init((r), (v))
 #define	EPAIR_REFCOUNT_AQUIRE(r)	refcount_acquire((r))
 #define	EPAIR_REFCOUNT_RELEASE(r)	refcount_release((r))
 #define	EPAIR_REFCOUNT_ASSERT(a, p)	KASSERT(a, p)
 #else
 #define	EPAIR_REFCOUNT_INIT(r, v)
 #define	EPAIR_REFCOUNT_AQUIRE(r)
 #define	EPAIR_REFCOUNT_RELEASE(r)
 #define	EPAIR_REFCOUNT_ASSERT(a, p)
 #endif
 
 static MALLOC_DEFINE(M_EPAIR, epairname,
     "Pair of virtual cross-over connected Ethernet-like interfaces");
 
 static VNET_DEFINE(struct if_clone *, epair_cloner);
 #define	V_epair_cloner	VNET(epair_cloner)
 
 /*
  * DPCPU area and functions.
  */
 struct epair_dpcpu {
 	struct mtx	if_epair_mtx;		/* Per-CPU locking. */
 	int		epair_drv_flags;	/* Per-CPU ``hw'' drv flags. */
 	struct eid_list	epair_ifp_drain_list;	/* Per-CPU list of ifps with
 						 * data in the ifq. */
 };
 DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu);
 
 static void
 epair_dpcpu_init(void)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct eid_list *s;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 
 		/* Initialize per-cpu lock. */
 		EPAIR_LOCK_INIT(epair_dpcpu);
 
 		/* Driver flags are per-cpu as are our netisr "hw" queues. */
 		epair_dpcpu->epair_drv_flags = 0;
 
 		/*
 		 * Initialize per-cpu drain list.
 		 * Manually do what STAILQ_HEAD_INITIALIZER would do.
 		 */
 		s = &epair_dpcpu->epair_ifp_drain_list;
 		s->stqh_first = NULL;
 		s->stqh_last = &s->stqh_first;
 	} 
 }
 
 static void
 epair_dpcpu_detach(void)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 
 		/* Destroy per-cpu lock. */
 		EPAIR_LOCK_DESTROY(epair_dpcpu);
 	}
 }
 
 /*
  * Helper functions.
  */
 static u_int
 cpuid_from_ifp(struct ifnet *ifp)
 {
 	struct epair_softc *sc;
 
 	if (ifp == NULL)
 		return (0);
 	sc = ifp->if_softc;
 
 	return (sc->cpuid);
 }
 
 /*
  * Netisr handler functions.
  */
 static void
 epair_nh_sintr(struct mbuf *m)
 {
 	struct ifnet *ifp;
 	struct epair_softc *sc;
 
 	ifp = m->m_pkthdr.rcvif;
 	(*ifp->if_input)(ifp, m);
 	sc = ifp->if_softc;
 	EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 	EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 	    ("%s: ifp=%p sc->refcount not >= 1: %d",
 	    __func__, ifp, sc->refcount));
 	DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount);
 }
 
 static struct mbuf *
 epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
 {
 
 	*cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif);
 
 	return (m);
 }
 
 static void
 epair_nh_drainedcpu(u_int cpuid)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_ifp_drain *elm, *tvar;
 	struct ifnet *ifp;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	/*
 	 * Assume our "hw" queue and possibly ifq will be emptied
 	 * again. In case we will overflow the "hw" queue while
 	 * draining, epair_start_locked will set IFF_DRV_OACTIVE
 	 * again and we will stop and return.
 	 */
 	STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
 	    ifp_next, tvar) {
 		ifp = elm->ifp;
 		epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE;
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		epair_start_locked(ifp);
 
 		IFQ_LOCK(&ifp->if_snd);
 		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
 			struct epair_softc *sc;
 
 			STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list,
 			    elm, epair_ifp_drain, ifp_next);
 			/* The cached ifp goes off the list. */
 			sc = ifp->if_softc;
 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
 			    __func__, ifp, sc->refcount));
 			free(elm, M_EPAIR);
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 
 		if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) {
 			/* Our "hw"q overflew again. */
 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 			DPRINTF("hw queue length overflow at %u\n",
 			    epair_nh.nh_qlimit);
 			break;
 		}
 	}
 	EPAIR_UNLOCK(epair_dpcpu);
 }
 
 /*
  * Network interface (`if') related functions.
  */
 static void
 epair_remove_ifp_from_draining(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_ifp_drain *elm, *tvar;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 		EPAIR_LOCK(epair_dpcpu);
 		STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
 		    ifp_next, tvar) {
 			if (ifp == elm->ifp) {
 				struct epair_softc *sc;
 
 				STAILQ_REMOVE(
 				    &epair_dpcpu->epair_ifp_drain_list, elm,
 				    epair_ifp_drain, ifp_next);
 				/* The cached ifp goes off the list. */
 				sc = ifp->if_softc;
 				EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 				EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 				    ("%s: ifp=%p sc->refcount not >= 1: %d",
 				    __func__, ifp, sc->refcount));
 				free(elm, M_EPAIR);
 			}
 		}
 		EPAIR_UNLOCK(epair_dpcpu);
 	}
 }
 
 static int
 epair_add_ifp_for_draining(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_softc *sc;
 	struct epair_ifp_drain *elm = NULL;
 
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 	STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next)
 		if (elm->ifp == ifp)
 			break;
 	/* If the ifp is there already, return success. */
 	if (elm != NULL)
 		return (0);
 
 	elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO);
 	if (elm == NULL)
 		return (ENOMEM);
 
 	elm->ifp = ifp;
 	/* Add a reference for the ifp pointer on the list. */
 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 	STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next);
 
 	return (0);
 }
 
 static void
 epair_start_locked(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct mbuf *m;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 	int error;
 
 	DPRINTF("ifp=%p\n", ifp);
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return;
 
 	/*
 	 * We get patckets here from ether_output via if_handoff()
 	 * and ned to put them into the input queue of the oifp
 	 * and call oifp->if_input() via netisr/epair_sintr().
 	 */
 	oifp = sc->oifp;
 	sc = oifp->if_softc;
 	for (;;) {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
 		BPF_MTAP(ifp, m);
 
 		/*
 		 * In case the outgoing interface is not usable,
 		 * drop the packet.
 		 */
 		if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 		    (oifp->if_flags & IFF_UP) ==0) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			continue;
 		}
 		DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
 
 		/*
 		 * Add a reference so the interface cannot go while the
 		 * packet is in transit as we rely on rcvif to stay valid.
 		 */
 		EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 		m->m_pkthdr.rcvif = oifp;
 		CURVNET_SET_QUIET(oifp->if_vnet);
 		error = netisr_queue(NETISR_EPAIR, m);
 		CURVNET_RESTORE();
 		if (!error) {
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			/* Someone else received the packet. */
 			if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
 		} else {
 			/* The packet was freed already. */
 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			(void) epair_add_ifp_for_draining(ifp);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
 			    __func__, oifp, sc->refcount));
 		}
 	}
 }
 
 static void
 epair_start(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	epair_start_locked(ifp);
 	EPAIR_UNLOCK(epair_dpcpu);
 }
 
 static int
 epair_transmit_locked(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 	int error, len;
 	short mflags;
 
 	DPRINTF("ifp=%p m=%p\n", ifp, m);
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 
 	if (m == NULL)
 		return (0);
 	
 	/*
 	 * We are not going to use the interface en/dequeue mechanism
 	 * on the TX side. We are called from ether_output_frame()
 	 * and will put the packet into the incoming queue of the
 	 * other interface of our pair via the netsir.
 	 */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		return (ENXIO);
 	}
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * In case the outgoing interface is not usable,
 	 * drop the packet.
 	 */
 	oifp = sc->oifp;
 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (oifp->if_flags & IFF_UP) ==0) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (0);
 	}
 	len = m->m_pkthdr.len;
 	mflags = m->m_flags;
 	DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
 
 #ifdef ALTQ
 	/* Support ALTQ via the clasic if_start() path. */
 	IF_LOCK(&ifp->if_snd);
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
 		if (error)
 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		IF_UNLOCK(&ifp->if_snd);
 		if (!error) {
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 			if (mflags & (M_BCAST|M_MCAST))
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 			
 			if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
 				epair_start_locked(ifp);
 			else
 				(void)epair_add_ifp_for_draining(ifp);
 		}
 		return (error);
 	}
 	IF_UNLOCK(&ifp->if_snd);
 #endif
 
 	if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
 		/*
 		 * Our hardware queue is full, try to fall back
 		 * queuing to the ifq but do not call ifp->if_start.
 		 * Either we are lucky or the packet is gone.
 		 */
 		IFQ_ENQUEUE(&ifp->if_snd, m, error);
 		if (!error)
 			(void)epair_add_ifp_for_draining(ifp);
 		return (error);
 	}
 	sc = oifp->if_softc;
 	/*
 	 * Add a reference so the interface cannot go while the
 	 * packet is in transit as we rely on rcvif to stay valid.
 	 */
 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 	m->m_pkthdr.rcvif = oifp;
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	error = netisr_queue(NETISR_EPAIR, m);
 	CURVNET_RESTORE();
 	if (!error) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		/*
 		 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
 		 * but as we bypass all this we have to duplicate
 		 * the logic another time.
 		 */
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if (mflags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		/* Someone else received the packet. */
 		if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
 	} else {
 		/* The packet was freed already. */
 		epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 		EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 		    ("%s: ifp=%p sc->refcount not >= 1: %d",
 		    __func__, oifp, sc->refcount));
 	}
 
 	return (error);
 }
 
 static int
 epair_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	int error;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	error = epair_transmit_locked(ifp, m);
 	EPAIR_UNLOCK(epair_dpcpu);
 	return (error);
 }
 
 static void
 epair_qflush(struct ifnet *ifp)
 {
 	struct epair_softc *sc;
 	
 	sc = ifp->if_softc;
 	KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n",
 	    __func__, ifp, sc));
 	/*
 	 * Remove this ifp from all backpointer lists. The interface will not
 	 * usable for flushing anyway nor should it have anything to flush
 	 * after if_qflush().
 	 */
 	epair_remove_ifp_from_draining(ifp);
 
 	if (sc->if_qflush)
 		sc->if_qflush(ifp);
 }
 
 static int
 epair_media_change(struct ifnet *ifp __unused)
 {
 
 	/* Do nothing. */
 	return (0);
 }
 
 static void
 epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
 {
 
 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
 }
 
 static int
 epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epair_softc *sc;
 	struct ifreq *ifr;
 	int error;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		error = 0;
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		sc = ifp->if_softc;
 		error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
 		break;
 
 	case SIOCSIFMTU:
 		/* We basically allow all kinds of MTUs. */
 		ifp->if_mtu = ifr->ifr_mtu;
 		error = 0;
 		break;
 
 	default:
 		/* Let the common ethernet handler process this. */
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 static void
 epair_init(void *dummy __unused)
 {
 }
 
 
 /*
  * Interface cloning functions.
  * We use our private ones so that we can create/destroy our secondary
  * device along with the primary one.
  */
 static int
 epair_clone_match(struct if_clone *ifc, const char *name)
 {
 	const char *cp;
 
 	DPRINTF("name='%s'\n", name);
 
 	/*
 	 * Our base name is epair.
 	 * Our interfaces will be named epair<n>[ab].
 	 * So accept anything of the following list:
 	 * - epair
 	 * - epair<n>
 	 * but not the epair<n>[ab] versions.
 	 */
 	if (strncmp(epairname, name, sizeof(epairname)-1) != 0)
 		return (0);
 
 	for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 static int
 epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	struct epair_softc *sca, *scb;
 	struct ifnet *ifp;
 	char *dp;
 	int error, unit, wildcard;
 	uint8_t eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	/*
 	 * We are abusing params to create our second interface.
 	 * Actually we already created it and called if_clone_create()
 	 * for it to do the official insertion procedure the moment we knew
 	 * it cannot fail anymore. So just do attach it here.
 	 */
 	if (params) {
 		scb = (struct epair_softc *)params;
 		ifp = scb->ifp;
 		/* Assign a hopefully unique, locally administered etheraddr. */
 		eaddr[0] = 0x02;
 		eaddr[3] = (ifp->if_index >> 8) & 0xff;
 		eaddr[4] = ifp->if_index & 0xff;
 		eaddr[5] = 0x0b;
 		ether_ifattach(ifp, eaddr);
 		/* Correctly set the name for the cloner list. */
 		strlcpy(name, scb->ifp->if_xname, len);
 		return (0);
 	}
 
 	/* Try to see if a special unit was requested. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If no unit had been given, we need to adjust the ifName.
 	 * Also make sure there is space for our extra [ab] suffix.
 	 */
 	for (dp = name; *dp != '\0'; dp++);
 	if (wildcard) {
 		error = snprintf(dp, len - (dp - name), "%d", unit);
 		if (error > len - (dp - name) - 1) {
 			/* ifName too long. */
 			ifc_free_unit(ifc, unit);
 			return (ENOSPC);
 		}
 		dp += error;
 	}
 	if (len - (dp - name) - 1 < 1) {
 		/* No space left for our [ab] suffix. */
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 	*dp = 'b';
 	/* Must not change dp so we can replace 'a' by 'b' later. */
 	*(dp+1) = '\0';
 
 	/* Check if 'a' and 'b' interfaces already exist. */ 
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 	*dp = 'a';
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 
 	/* Allocate memory for both [ab] interfaces */
 	sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	EPAIR_REFCOUNT_INIT(&sca->refcount, 1);
 	sca->ifp = if_alloc(IFT_ETHER);
 	if (sca->ifp == NULL) {
 		free(sca, M_EPAIR);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 
 	scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	EPAIR_REFCOUNT_INIT(&scb->refcount, 1);
 	scb->ifp = if_alloc(IFT_ETHER);
 	if (scb->ifp == NULL) {
 		free(scb, M_EPAIR);
 		if_free(sca->ifp);
 		free(sca, M_EPAIR);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 	
 	/*
 	 * Cross-reference the interfaces so we will be able to free both.
 	 */
 	sca->oifp = scb->ifp;
 	scb->oifp = sca->ifp;
 
 	/*
 	 * Calculate the cpuid for netisr queueing based on the
 	 * ifIndex of the interfaces. As long as we cannot configure
 	 * this or use cpuset information easily we cannot guarantee
 	 * cache locality but we can at least allow parallelism.
 	 */
 	sca->cpuid =
 	    netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount());
 	scb->cpuid =
 	    netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount());
 
 	/* Initialise pseudo media types. */
 	ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T);
 	ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T);
 	
 	/* Finish initialization of interface <n>a. */
 	ifp = sca->ifp;
 	ifp->if_softc = sca;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = epairname;
 	ifp->if_dunit = unit;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU;
 	ifp->if_start = epair_start;
 	ifp->if_ioctl = epair_ioctl;
 	ifp->if_init  = epair_init;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/* Assign a hopefully unique, locally administered etheraddr. */
 	eaddr[0] = 0x02;
 	eaddr[3] = (ifp->if_index >> 8) & 0xff;
 	eaddr[4] = ifp->if_index & 0xff;
 	eaddr[5] = 0x0a;
 	ether_ifattach(ifp, eaddr);
 	sca->if_qflush = ifp->if_qflush;
 	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/* Swap the name and finish initialization of interface <n>b. */
 	*dp = 'b';
 
 	ifp = scb->ifp;
 	ifp->if_softc = scb;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = epairname;
 	ifp->if_dunit = unit;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU;
 	ifp->if_start = epair_start;
 	ifp->if_ioctl = epair_ioctl;
 	ifp->if_init  = epair_init;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/* We need to play some tricks here for the second interface. */
 	strlcpy(name, epairname, len);
 	error = if_clone_create(name, len, (caddr_t)scb);
 	if (error)
 		panic("%s: if_clone_create() for our 2nd iface failed: %d",
 		    __func__, error);
 	scb->if_qflush = ifp->if_qflush;
 	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/*
 	 * Restore name to <n>a as the ifp for this will go into the
 	 * cloner list for the initial call.
 	 */
 	strlcpy(name, sca->ifp->if_xname, len);
 	DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb);
 
 	/* Tell the world, that we are ready to rock. */
 	sca->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	scb->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if_link_state_change(sca->ifp, LINK_STATE_UP);
 	if_link_state_change(scb->ifp, LINK_STATE_UP);
 
 	return (0);
 }
 
 static int
 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifnet *oifp;
 	struct epair_softc *sca, *scb;
 	int unit, error;
 
 	DPRINTF("ifp=%p\n", ifp);
 
 	/*
 	 * In case we called into if_clone_destroyif() ourselves
 	 * again to remove the second interface, the softc will be
 	 * NULL. In that case so not do anything but return success.
 	 */
 	if (ifp->if_softc == NULL)
 		return (0);
 	
 	unit = ifp->if_dunit;
 	sca = ifp->if_softc;
 	oifp = sca->oifp;
 	scb = oifp->if_softc;
 
 	DPRINTF("ifp=%p oifp=%p\n", ifp, oifp);
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	if_link_state_change(oifp, LINK_STATE_DOWN);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	oifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Get rid of our second half. As the other of the two
 	 * interfaces may reside in a different vnet, we need to
 	 * switch before freeing them.
 	 */
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	ether_ifdetach(oifp);
 	/*
 	 * Wait for all packets to be dispatched to if_input.
 	 * The numbers can only go down as the interface is
 	 * detached so there is no need to use atomics.
 	 */
 	DPRINTF("scb refcnt=%u\n", scb->refcount);
 	EPAIR_REFCOUNT_ASSERT(scb->refcount == 1,
 	    ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount));
 	oifp->if_softc = NULL;
 	error = if_clone_destroyif(ifc, oifp);
 	if (error)
 		panic("%s: if_clone_destroyif() for our 2nd iface failed: %d",
 		    __func__, error);
 	if_free(oifp);
 	ifmedia_removeall(&scb->media);
 	free(scb, M_EPAIR);
 	CURVNET_RESTORE();
 
 	ether_ifdetach(ifp);
 	/*
 	 * Wait for all packets to be dispatched to if_input.
 	 */
 	DPRINTF("sca refcnt=%u\n", sca->refcount);
 	EPAIR_REFCOUNT_ASSERT(sca->refcount == 1,
 	    ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount));
 	if_free(ifp);
 	ifmedia_removeall(&sca->media);
 	free(sca, M_EPAIR);
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 static void
 vnet_epair_init(const void *unused __unused)
 {
 
 	V_epair_cloner = if_clone_advanced(epairname, 0,
 	    epair_clone_match, epair_clone_create, epair_clone_destroy);
+	netisr_register(&epair_nh);
 }
 VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_epair_init, NULL);
 
 static void
 vnet_epair_uninit(const void *unused __unused)
 {
 
+	netisr_unregister(&epair_nh);
 	if_clone_detach(V_epair_cloner);
 }
 VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_epair_uninit, NULL);
 
 static int
 epair_modevent(module_t mod, int type, void *data)
 {
 	int qlimit;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* For now limit us to one global mutex and one inq. */
 		epair_dpcpu_init();
 		epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */
 		if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit))
 		    epair_nh.nh_qlimit = qlimit;
-		netisr_register(&epair_nh);
 		if (bootverbose)
 			printf("%s initialized.\n", epairname);
 		break;
 	case MOD_UNLOAD:
-		netisr_unregister(&epair_nh);
 		epair_dpcpu_detach();
 		if (bootverbose)
 			printf("%s unloaded.\n", epairname);
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t epair_mod = {
 	"if_epair",
 	epair_modevent,
 	0
 };
 
-DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
 MODULE_VERSION(if_epair, 1);
Index: projects/vnet/sys/net/if_ethersubr.c
===================================================================
--- projects/vnet/sys/net/if_ethersubr.c	(revision 298547)
+++ projects/vnet/sys/net/if_ethersubr.c	(revision 298548)
@@ -1,1198 +1,1200 @@
 /*-
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(struct pfil_head, link_pfil_hook);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); 
 int	(*bridge_output_p)(struct ifnet *, struct mbuf *, 
 		struct sockaddr *, struct rtentry *);
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); 
 
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 #ifdef VIMAGE
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 #define	ETHER_IS_BROADCAST(addr) \
 	(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
 
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int csum_flags = 0;
 
 	if (src->m_pkthdr.csum_flags & CSUM_IP)
 		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
 	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
 		csum_flags |= CSUM_SCTP_VALID;
 	dst->m_pkthdr.csum_flags |= csum_flags;
 	if (csum_flags & CSUM_DATA_VALID)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  */
 static int
 ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct ether_header *eh;
 	struct arphdr *ah;
 	uint16_t etype;
 	const u_char *lladdr;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < ETHER_HDR_LEN)
 		return (ENOMEM);
 
 	eh = (struct ether_header *)req->buf;
 	lladdr = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		etype = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		etype = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		ah = (struct arphdr *)req->hdata;
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		switch(ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			etype = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			etype = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		if (req->flags & IFENCAP_FLAG_BROADCAST)
 			lladdr = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
 	memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	req->bufsize = sizeof(struct ether_header);
 
 	return (0);
 }
 
 
 static int
 ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro, u_char *phdr,
 	uint32_t *pflags)
 {
 	struct ether_header *eh;
 	uint32_t lleflags = 0;
 	int error = 0;
 #if defined(INET) || defined(INET6)
 	uint16_t etype;
 #endif
 
 	eh = (struct ether_header *)phdr;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags);
 		else {
 			if (m->m_flags & M_BCAST)
 				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
 				    ETHER_ADDR_LEN);
 			else {
 				const struct in_addr *a;
 				a = &(((const struct sockaddr_in *)dst)->sin_addr);
 				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
 			}
 			etype = htons(ETHERTYPE_IP);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0)
 			error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags);
 		else {
 			const struct in6_addr *a6;
 			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
 			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
 			etype = htons(ETHERTYPE_IPV6);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	int error = 0;
 	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	int loop_copy = 1;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		phdr = ro->ro_prepend;
 		hlen = ro->ro_plen;
 		pflags = ro->ro_flags;
 	}
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = ETHER_HDR_LEN;
 		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags);
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, dst->sa_family, 0));
 	}
 	loop_copy = pflags & RT_MAY_LOOP;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 *
 	 * Note that we do prepend regardless of RT_HAS_HEADER flag.
 	 * This is done because BPF code shifts m_data pointer
 	 * to the end of ethernet header prior to calling if_output().
 	 */
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, phdr, hlen);
 	}
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		struct mbuf *n;
 
 		/*
 		 * Because if_simloop() modifies the packet, we need a
 		 * writable copy through m_dup() instead of a readonly
 		 * one as m_copy[m] would give us. The alternative would
 		 * be to modify if_simloop() to handle the readonly mbuf,
 		 * but performancewise it is mostly equivalent (trading
 		 * extra data copying vs. extra locking).
 		 *
 		 * XXX This is a local workaround.  A number of less
 		 * often used kernel parts suffer from the same bug.
 		 * See PR kern/105943 for a proposed general solution.
 		 */
 		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 			update_mbuf_csumflags(m, n);
 			(void)if_simloop(ifp, n, dst->sa_family, hlen);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
        /*
 	* Bridges require special output handling.
 	*/
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	int i;
 
 	if (PFIL_HOOKED(&V_link_pfil_hook)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL);
 
 		if (i != 0)
 			return (EACCES);
 
 		if (m == NULL)
 			return (0);
 	}
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 #if defined(INET) || defined(INET6)
 #endif
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER);
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_p)(ifp, m);
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 */
 	if (ifp->if_bridge != NULL) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL,
 	    ("%s: NULL interface pointer", __func__));
 	ether_input_internal(m->m_pkthdr.rcvif, m);
 }
 
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 static void
-ether_init(__unused void *arg)
-{
-
-	netisr_register(&ether_nh);
-}
-SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
-
-static void
 vnet_ether_init(__unused void *arg)
 {
 	int i;
 
 	/* Initialize packet filter hooks. */
 	V_link_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_link_pfil_hook.ph_af = AF_LINK;
 	if ((i = pfil_head_register(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil link hook, "
 			"error %d\n", __func__, i);
+	netisr_register(&ether_nh);
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
  
 static void
-vnet_ether_destroy(__unused void *arg)
+vnet_ether_pfil_destroy(__unused void *arg)
 {
 	int i;
 
 	if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil link hook, "
 			"error %d\n", __func__, i);
 }
-VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
-    vnet_ether_destroy, NULL);
+VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
+    vnet_ether_pfil_destroy, NULL);
 
+static void
+vnet_ether_destroy(__unused void *arg)
+{
 
+	netisr_unregister(&ether_nh);
+}
+VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+    vnet_ether_destroy, NULL);
 
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	struct mbuf *mn;
 
 	/*
 	 * The drivers are allowed to pass in a chain of packets linked with
 	 * m_nextpkt. We split them up into separate packets here and pass
 	 * them up. This allows the drivers to amortize the receive lock.
 	 */
 	while (m) {
 		mn = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We will rely on rcvif being set properly in the deferred context,
 		 * so assert it is correct here.
 		 */
 		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__));
+		CURVNET_SET_QUIET(ifp->if_vnet);
 		netisr_dispatch(NETISR_ETHER, m);
+		CURVNET_RESTORE();
 		m = mn;
 	}
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL);
 
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 * Strip off Ethernet header.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 	m_adj(m, ETHER_HDR_LEN);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		goto discard;
 	}
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		/*
 		 * Put back the ethernet header so netgraph has a
 		 * consistent view of inbound packets.
 		 */
 		M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	static char etherbuf[18];
 	snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
 	return (etherbuf);
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	int i;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	if_attach(ifp);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce Ethernet MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break; 
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 }
 
 /*
  * Perform common duties while detaching an Ethernet interface
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
 	uuid_ether_del(LLADDR(sdl));
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	if (ng_ether_attach_p != NULL) {
 		CURVNET_SET_QUIET(new_vnet);
 		(*ng_ether_attach_p)(ifp);
 		CURVNET_RESTORE();
 	}
 }
 #endif
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
 			crc <<= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
 		}
 	}
 
 	return (crc);
 }
 
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 		{
 			struct sockaddr *sa;
 
 			sa = (struct sockaddr *) & ifr->ifr_data;
 			bcopy(IF_LLADDR(ifp),
 			      (caddr_t) sa->sa_data, ETHER_ADDR_LEN);
 		}
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		if (ifr->ifr_mtu > ETHERMTU) {
 			error = EINVAL;
 		} else {
 			ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 	default:
 		error = EINVAL;			/* XXX netbsd has ENOTTY??? */
 		break;
 	}
 	return (error);
 }
 
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if (!ETHER_IS_MULTICAST(e_addr))
 			return EADDRNOTAVAIL;
 		*llsa = 0;
 		return 0;
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = 0;
 			return 0;
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return EAFNOSUPPORT;
 	}
 }
 
 static moduledata_t ether_mod = {
 	.name = "ether",
 };
 
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 struct mbuf *
 ether_vlanencap(struct mbuf *m, uint16_t tag)
 {
 	struct ether_vlan_header *evl;
 
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 	/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
 
 	if (m->m_len < sizeof(*evl)) {
 		m = m_pullup(m, sizeof(*evl));
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * Transform the Ethernet header into an Ethernet header
 	 * with 802.1Q encapsulation.
 	 */
 	evl = mtod(m, struct ether_vlan_header *);
 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
 	    (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
 	evl->evl_tag = htons(tag);
 	return (m);
 }
 
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
Index: projects/vnet/sys/net/netisr.c
===================================================================
--- projects/vnet/sys/net/netisr.c	(revision 298547)
+++ projects/vnet/sys/net/netisr.c	(revision 298548)
@@ -1,1372 +1,1380 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * netisr is a packet dispatch service, allowing synchronous (directly
  * dispatched) and asynchronous (deferred dispatch) processing of packets by
  * registered protocol handlers.  Callers pass a protocol identifier and
  * packet to netisr, along with a direct dispatch hint, and work will either
  * be immediately processed by the registered handler, or passed to a
  * software interrupt (SWI) thread for deferred dispatch.  Callers will
  * generally select one or the other based on:
  *
  * - Whether directly dispatching a netisr handler lead to code reentrance or
  *   lock recursion, such as entering the socket code from the socket code.
  * - Whether directly dispatching a netisr handler lead to recursive
  *   processing, such as when decapsulating several wrapped layers of tunnel
  *   information (IPSEC within IPSEC within ...).
  *
  * Maintaining ordering for protocol streams is a critical design concern.
  * Enforcing ordering limits the opportunity for concurrency, but maintains
  * the strong ordering requirements found in some protocols, such as TCP.  Of
  * related concern is CPU affinity--it is desirable to process all data
  * associated with a particular stream on the same CPU over time in order to
  * avoid acquiring locks associated with the connection on different CPUs,
  * keep connection data in one cache, and to generally encourage associated
  * user threads to live on the same CPU as the stream.  It's also desirable
  * to avoid lock migration and contention where locks are associated with
  * more than one flow.
  *
  * netisr supports several policy variations, represented by the
  * NETISR_POLICY_* constants, allowing protocols to play various roles in
  * identifying flows, assigning work to CPUs, etc.  These are described in
  * netisr.h.
  */
 
 #include "opt_ddb.h"
 #include "opt_device_polling.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/interrupt.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/netisr_internal.h>
 #include <net/vnet.h>
 
 /*-
  * Synchronize use and modification of the registered netisr data structures;
  * acquire a read lock while modifying the set of registered protocols to
  * prevent partially registered or unregistered protocols from being run.
  *
  * The following data structures and fields are protected by this lock:
  *
  * - The netisr_proto array, including all fields of struct netisr_proto.
  * - The nws array, including all fields of struct netisr_worker.
  * - The nws_array array.
  *
  * Note: the NETISR_LOCKING define controls whether read locks are acquired
  * in packet processing paths requiring netisr registration stability.  This
  * is disabled by default as it can lead to measurable performance
  * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
  * because netisr registration and unregistration is extremely rare at
  * runtime.  If it becomes more common, this decision should be revisited.
  *
  * XXXRW: rmlocks don't support assertions.
  */
 static struct rmlock	netisr_rmlock;
 #define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
 				    RM_NOWITNESS)
 #define	NETISR_LOCK_ASSERT()
 #define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
 #define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
 #define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
 #define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
 /* #define	NETISR_LOCKING */
 
 static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");
 
 /*-
  * Three global direct dispatch policies are supported:
  *
  * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of
  * context (may be overriden by protocols).
  *
  * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
  * and we're running on the CPU the work would be performed on, then direct
  * dispatch it if it wouldn't violate ordering constraints on the workstream.
  *
  * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
  * always direct dispatch.  (The default.)
  *
  * Notice that changing the global policy could lead to short periods of
  * misordered processing, but this is considered acceptable as compared to
  * the complexity of enforcing ordering during policy changes.  Protocols can
  * override the global policy (when they're not doing that, they select
  * NETISR_DISPATCH_DEFAULT).
  */
 #define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
 #define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
 static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
 static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN,
     0, 0, sysctl_netisr_dispatch_policy, "A",
     "netisr dispatch policy");
 
 /*
  * Allow the administrator to limit the number of threads (CPUs) to use for
  * netisr.  We don't check netisr_maxthreads before creating the thread for
  * CPU 0. This must be set at boot. We will create at most one thread per CPU.
  * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and
  * therefore only 1 workstream. If set to -1, netisr would use all cpus
  * (mp_ncpus) and therefore would have those many workstreams. One workstream
  * per thread (CPU).
  */
 static int	netisr_maxthreads = 1;		/* Max number of threads. */
 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
     &netisr_maxthreads, 0,
     "Use at most this many CPUs for netisr processing");
 
 static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
     &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");
 
 /*
  * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit,
  * both for initial configuration and later modification using
  * netisr_setqlimit().
  */
 #define	NETISR_DEFAULT_MAXQLIMIT	10240
 static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
     &netisr_maxqlimit, 0,
     "Maximum netisr per-protocol, per-CPU queue depth.");
 
 /*
  * The default per-workstream mbuf queue limit for protocols that don't
  * initialize the nh_qlimit field of their struct netisr_handler.  If this is
  * set above netisr_maxqlimit, we truncate it to the maximum during boot.
  */
 #define	NETISR_DEFAULT_DEFAULTQLIMIT	256
 static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
     &netisr_defaultqlimit, 0,
     "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
 
 /*
  * Store and export the compile-time constant NETISR_MAXPROT limit on the
  * number of protocols that can register with netisr at a time.  This is
  * required for crashdump analysis, as it sizes netisr_proto[].
  */
 static u_int	netisr_maxprot = NETISR_MAXPROT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
     &netisr_maxprot, 0,
     "Compile-time limit on the number of protocols supported by netisr.");
 
 /*
  * The netisr_proto array describes all registered protocols, indexed by
  * protocol number.  See netisr_internal.h for more details.
  */
-static struct netisr_proto	netisr_proto[NETISR_MAXPROT];
+static VNET_DEFINE(struct netisr_proto,	netisr_proto[NETISR_MAXPROT]);
+#define	V_netisr_proto		VNET(netisr_proto)
 
 /*
  * Per-CPU workstream data.  See netisr_internal.h for more details.
  */
 DPCPU_DEFINE(struct netisr_workstream, nws);
 
 /*
  * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
  * accessing workstreams.  This allows constructions of the form
  * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
  */
 static u_int				 nws_array[MAXCPU];
 
 /*
  * Number of registered workstreams.  Will be at most the number of running
  * CPUs once fully started.
  */
 static u_int				 nws_count;
 SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
     &nws_count, 0, "Number of extant netisr threads.");
 
 /*
  * Synchronization for each workstream: a mutex protects all mutable fields
  * in each stream, including per-protocol state (mbuf queues).  The SWI is
  * woken up if asynchronous dispatch is required.
  */
 #define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
 #define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
 #define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
 #define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
 
 /*
  * Utility routines for protocols that implement their own mapping of flows
  * to CPUs.
  */
 u_int
 netisr_get_cpucount(void)
 {
 
 	return (nws_count);
 }
 
 u_int
 netisr_get_cpuid(u_int cpunumber)
 {
 
 	KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber,
 	    nws_count));
 
 	return (nws_array[cpunumber]);
 }
 
 /*
  * The default implementation of flow -> CPU ID mapping.
  *
  * Non-static so that protocols can use it to map their own work to specific
  * CPUs in a manner consistent to netisr for affinity purposes.
  */
 u_int
 netisr_default_flow2cpu(u_int flowid)
 {
 
 	return (nws_array[flowid % nws_count]);
 }
 
 /*
  * Dispatch tunable and sysctl configuration.
  */
 struct netisr_dispatch_table_entry {
 	u_int		 ndte_policy;
 	const char	*ndte_policy_str;
 };
 static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
 	{ NETISR_DISPATCH_DEFAULT, "default" },
 	{ NETISR_DISPATCH_DEFERRED, "deferred" },
 	{ NETISR_DISPATCH_HYBRID, "hybrid" },
 	{ NETISR_DISPATCH_DIRECT, "direct" },
 };
 static const u_int netisr_dispatch_table_len =
     (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0]));
 
 static void
 netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
     u_int buflen)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	const char *str;
 	u_int i;
 
 	str = "unknown";
 	for (i = 0; i < netisr_dispatch_table_len; i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (ndtep->ndte_policy == dispatch_policy) {
 			str = ndtep->ndte_policy_str;
 			break;
 		}
 	}
 	snprintf(buffer, buflen, "%s", str);
 }
 
 static int
 netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	u_int i;
 
 	for (i = 0; i < netisr_dispatch_table_len; i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (strcmp(ndtep->ndte_policy_str, str) == 0) {
 			*dispatch_policyp = ndtep->ndte_policy;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 static int
 sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
 {
 	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
 	u_int dispatch_policy;
 	int error;
 
 	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
 	    sizeof(tmp));
 	error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
 	if (error == 0 && req->newptr != NULL) {
 		error = netisr_dispatch_policy_from_str(tmp,
 		    &dispatch_policy);
 		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
 			error = EINVAL;
 		if (error == 0)
 			netisr_dispatch_policy = dispatch_policy;
 	}
 	return (error);
 }
 
 /*
  * Register a new netisr handler, which requires initializing per-protocol
  * fields for each workstream.  All netisr work is briefly suspended while
  * the protocol is installed.
  */
 void
 netisr_register(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 	const char *name;
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 	name = nhp->nh_name;
 
 	/*
 	 * Test that the requested registration is valid.
 	 */
 	KASSERT(nhp->nh_name != NULL,
 	    ("%s: nh_name NULL for %u", __func__, proto));
 	KASSERT(nhp->nh_handler != NULL,
 	    ("%s: nh_handler NULL for %s", __func__, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
 	    nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_policy == NETISR_POLICY_CPU,
 	    ("%s: unsupported nh_policy %u for %s", __func__,
 	    nhp->nh_policy, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_m2flow == NULL,
 	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
 	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
 	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
 
 	/*
 	 * Test that no existing registration exists for this protocol.
 	 */
 	NETISR_WLOCK();
-	KASSERT(netisr_proto[proto].np_name == NULL,
+	KASSERT(V_netisr_proto[proto].np_name == NULL,
 	    ("%s(%u, %s): name present", __func__, proto, name));
-	KASSERT(netisr_proto[proto].np_handler == NULL,
+	KASSERT(V_netisr_proto[proto].np_handler == NULL,
 	    ("%s(%u, %s): handler present", __func__, proto, name));
 
-	netisr_proto[proto].np_name = name;
-	netisr_proto[proto].np_handler = nhp->nh_handler;
-	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
-	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
-	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
+	V_netisr_proto[proto].np_name = name;
+	V_netisr_proto[proto].np_handler = nhp->nh_handler;
+	V_netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
+	V_netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
+	V_netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
 	if (nhp->nh_qlimit == 0)
-		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
+		V_netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
 	else if (nhp->nh_qlimit > netisr_maxqlimit) {
 		printf("%s: %s requested queue limit %u capped to "
 		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
 		    netisr_maxqlimit);
-		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
+		V_netisr_proto[proto].np_qlimit = netisr_maxqlimit;
 	} else
-		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
-	netisr_proto[proto].np_policy = nhp->nh_policy;
-	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
+		V_netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
+	V_netisr_proto[proto].np_policy = nhp->nh_policy;
+	V_netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		bzero(npwp, sizeof(*npwp));
-		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
+		npwp->nw_qlimit = V_netisr_proto[proto].np_qlimit;
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Clear drop counters across all workstreams for a protocol.
  */
 void
 netisr_clearqdrops(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(netisr_proto[proto].np_handler != NULL,
+	KASSERT(V_netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qdrops = 0;
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Query current drop counters across all workstreams for a protocol.
  */
 void
 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
 {
 	struct netisr_work *npwp;
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	*qdropp = 0;
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
-	KASSERT(netisr_proto[proto].np_handler != NULL,
+	KASSERT(V_netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		*qdropp += npwp->nw_qdrops;
 	}
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Query current per-workstream queue limit for a protocol.
  */
 void
 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
 {
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
-	KASSERT(netisr_proto[proto].np_handler != NULL,
+	KASSERT(V_netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
-	*qlimitp = netisr_proto[proto].np_qlimit;
+	*qlimitp = V_netisr_proto[proto].np_qlimit;
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Update the queue limit across per-workstream queues for a protocol.  We
  * simply change the limits, and don't drain overflowed packets as they will
  * (hopefully) take care of themselves shortly.
  */
 int
 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	if (qlimit > netisr_maxqlimit)
 		return (EINVAL);
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(netisr_proto[proto].np_handler != NULL,
+	KASSERT(V_netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
-	netisr_proto[proto].np_qlimit = qlimit;
+	V_netisr_proto[proto].np_qlimit = qlimit;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qlimit = qlimit;
 	}
 	NETISR_WUNLOCK();
 	return (0);
 }
 
 /*
  * Drain all packets currently held in a particular protocol work queue.
  */
 static void
 netisr_drain_proto(struct netisr_work *npwp)
 {
 	struct mbuf *m;
 
 	/*
 	 * We would assert the lock on the workstream but it's not passed in.
 	 */
 	while ((m = npwp->nw_head) != NULL) {
 		npwp->nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL)
 			npwp->nw_tail = NULL;
 		npwp->nw_len--;
 		m_freem(m);
 	}
 	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
 	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
 }
 
 /*
  * Remove the registration of a network protocol, which requires clearing
  * per-protocol fields across all workstreams, including freeing all mbufs in
  * the queues at time of unregister.  All work in netisr is briefly suspended
  * while this takes place.
  */
 void
 netisr_unregister(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(netisr_proto[proto].np_handler != NULL,
+	KASSERT(V_netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
-	netisr_proto[proto].np_name = NULL;
-	netisr_proto[proto].np_handler = NULL;
-	netisr_proto[proto].np_m2flow = NULL;
-	netisr_proto[proto].np_m2cpuid = NULL;
-	netisr_proto[proto].np_qlimit = 0;
-	netisr_proto[proto].np_policy = 0;
+	V_netisr_proto[proto].np_name = NULL;
+	V_netisr_proto[proto].np_handler = NULL;
+	V_netisr_proto[proto].np_m2flow = NULL;
+	V_netisr_proto[proto].np_m2cpuid = NULL;
+	V_netisr_proto[proto].np_qlimit = 0;
+	V_netisr_proto[proto].np_policy = 0;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		netisr_drain_proto(npwp);
 		bzero(npwp, sizeof(*npwp));
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Compose the global and per-protocol policies on dispatch, and return the
  * dispatch policy to use.
  */
 static u_int
 netisr_get_dispatch(struct netisr_proto *npp)
 {
 
 	/*
 	 * Protocol-specific configuration overrides the global default.
 	 */
 	if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT)
 		return (npp->np_dispatch);
 	return (netisr_dispatch_policy);
 }
 
 /*
  * Look up the workstream given a packet and source identifier.  Do this by
  * checking the protocol's policy, and optionally call out to the protocol
  * for assistance if required.
  */
 static struct mbuf *
 netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
     uintptr_t source, struct mbuf *m, u_int *cpuidp)
 {
 	struct ifnet *ifp;
 	u_int policy;
 
 	NETISR_LOCK_ASSERT();
 
 	/*
 	 * In the event we have only one worker, shortcut and deliver to it
 	 * without further ado.
 	 */
 	if (nws_count == 1) {
 		*cpuidp = nws_array[0];
 		return (m);
 	}
 
 	/*
 	 * What happens next depends on the policy selected by the protocol.
 	 * If we want to support per-interface policies, we should do that
 	 * here first.
 	 */
 	policy = npp->np_policy;
 	if (policy == NETISR_POLICY_CPU) {
 		m = npp->np_m2cpuid(m, source, cpuidp);
 		if (m == NULL)
 			return (NULL);
 
 		/*
 		 * It's possible for a protocol not to have a good idea about
 		 * where to process a packet, in which case we fall back on
 		 * the netisr code to decide.  In the hybrid case, return the
 		 * current CPU ID, which will force an immediate direct
 		 * dispatch.  In the queued case, fall back on the SOURCE
 		 * policy.
 		 */
 		if (*cpuidp != NETISR_CPUID_NONE)
 			return (m);
 		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
 			*cpuidp = curcpu;
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	if (policy == NETISR_POLICY_FLOW) {
 		if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE &&
 		    npp->np_m2flow != NULL) {
 			m = npp->np_m2flow(m, source);
 			if (m == NULL)
 				return (NULL);
 		}
 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 			*cpuidp =
 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	KASSERT(policy == NETISR_POLICY_SOURCE,
 	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
 	    npp->np_name));
 
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL)
 		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
 	else
 		*cpuidp = nws_array[source % nws_count];
 	return (m);
 }
 
 /*
  * Process packets associated with a workstream and protocol.  For reasons of
  * fairness, we process up to one complete netisr queue at a time, moving the
  * queue to a stack-local queue for processing, but do not loop refreshing
  * from the global queue.  The caller is responsible for deciding whether to
  * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
  * locked on entry and relocked before return, but will be released while
  * processing.  The number of packets processed is returned.
  */
 static u_int
 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
 {
 	struct netisr_work local_npw, *npwp;
 	u_int handled;
 	struct mbuf *m;
 
 	NETISR_LOCK_ASSERT();
 	NWS_LOCK_ASSERT(nwsp);
 
 	KASSERT(nwsp->nws_flags & NWS_RUNNING,
 	    ("%s(%u): not running", __func__, proto));
 	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
 	    ("%s(%u): invalid proto\n", __func__, proto));
 
 	npwp = &nwsp->nws_work[proto];
 	if (npwp->nw_len == 0)
 		return (0);
 
 	/*
 	 * Move the global work queue to a thread-local work queue.
 	 *
 	 * Notice that this means the effective maximum length of the queue
 	 * is actually twice that of the maximum queue length specified in
 	 * the protocol registration call.
 	 */
 	handled = npwp->nw_len;
 	local_npw = *npwp;
 	npwp->nw_head = NULL;
 	npwp->nw_tail = NULL;
 	npwp->nw_len = 0;
 	nwsp->nws_pendingbits &= ~(1 << proto);
 	NWS_UNLOCK(nwsp);
 	while ((m = local_npw.nw_head) != NULL) {
 		local_npw.nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (local_npw.nw_head == NULL)
 			local_npw.nw_tail = NULL;
 		local_npw.nw_len--;
 		VNET_ASSERT(m->m_pkthdr.rcvif != NULL,
 		    ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m));
 		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
-		netisr_proto[proto].np_handler(m);
+		V_netisr_proto[proto].np_handler(m);
 		CURVNET_RESTORE();
 	}
 	KASSERT(local_npw.nw_len == 0,
 	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
-	if (netisr_proto[proto].np_drainedcpu)
-		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
+	/* We can just use the one from the default VNET. */
+	CURVNET_SET_QUIET(vnet0);
+	if (V_netisr_proto[proto].np_drainedcpu)
+		V_netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
+	CURVNET_RESTORE();
 	NWS_LOCK(nwsp);
 	npwp->nw_handled += handled;
 	return (handled);
 }
 
 /*
  * SWI handler for netisr -- processes packets in a set of workstreams that
  * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
  * being direct dispatched, go back to sleep and wait for the dispatching
  * thread to wake us up again.
  */
 static void
 swi_net(void *arg)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	u_int bits, prot;
 
 	nwsp = arg;
 
 #ifdef DEVICE_POLLING
 	KASSERT(nws_count == 1,
 	    ("%s: device_polling but nws_count != 1", __func__));
 	netisr_poll();
 #endif
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	NWS_LOCK(nwsp);
 	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
 	if (nwsp->nws_flags & NWS_DISPATCHING)
 		goto out;
 	nwsp->nws_flags |= NWS_RUNNING;
 	nwsp->nws_flags &= ~NWS_SCHEDULED;
 	while ((bits = nwsp->nws_pendingbits) != 0) {
 		while ((prot = ffs(bits)) != 0) {
 			prot--;
 			bits &= ~(1 << prot);
 			(void)netisr_process_workstream_proto(nwsp, prot);
 		}
 	}
 	nwsp->nws_flags &= ~NWS_RUNNING;
 out:
 	NWS_UNLOCK(nwsp);
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 #ifdef DEVICE_POLLING
 	netisr_pollmore();
 #endif
 }
 
 static int
 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
     struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
 {
 
 	NWS_LOCK_ASSERT(nwsp);
 
 	*dosignalp = 0;
 	if (npwp->nw_len < npwp->nw_qlimit) {
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL) {
 			npwp->nw_head = m;
 			npwp->nw_tail = m;
 		} else {
 			npwp->nw_tail->m_nextpkt = m;
 			npwp->nw_tail = m;
 		}
 		npwp->nw_len++;
 		if (npwp->nw_len > npwp->nw_watermark)
 			npwp->nw_watermark = npwp->nw_len;
 
 		/*
 		 * We must set the bit regardless of NWS_RUNNING, so that
 		 * swi_net() keeps calling netisr_process_workstream_proto().
 		 */
 		nwsp->nws_pendingbits |= (1 << proto);
 		if (!(nwsp->nws_flags & 
 		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
 			nwsp->nws_flags |= NWS_SCHEDULED;
 			*dosignalp = 1;	/* Defer until unlocked. */
 		}
 		npwp->nw_queued++;
 		return (0);
 	} else {
 		m_freem(m);
 		npwp->nw_qdrops++;
 		return (ENOBUFS);
 	}
 }
 
 static int
 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 
 #ifdef NETISR_LOCKING
 	NETISR_LOCK_ASSERT();
 #endif
 	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
 	    cpuid, mp_maxid));
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	dosignal = 0;
 	error = 0;
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	npwp = &nwsp->nws_work[proto];
 	NWS_LOCK(nwsp);
 	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	return (error);
 }
 
 int
 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	u_int cpuid;
 	int error;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
-	KASSERT(netisr_proto[proto].np_handler != NULL,
-	    ("%s: invalid proto %u", __func__, proto));
+	if (V_netisr_proto[proto].np_handler == NULL) {
+		m_freem(m);
+		return (ENOPROTOOPT);
+	}
 
-	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
+	m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
 	    source, m, &cpuid);
 	if (m != NULL) {
 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
 		    cpuid));
 		error = netisr_queue_internal(proto, m, cpuid);
 	} else
 		error = ENOBUFS;
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_queue(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_queue_src(proto, 0, m));
 }
 
 /*
  * Dispatch a packet for netisr processing; direct dispatch is permitted by
  * calling context.
  */
 int
 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 	u_int cpuid, dispatch_policy;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
-	npp = &netisr_proto[proto];
-	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
-	    proto));
+	npp = &V_netisr_proto[proto];
+	if (npp->np_handler == NULL) {
+		m_freem(m);
+		return (ENOPROTOOPT);
+	}
 
 	dispatch_policy = netisr_get_dispatch(npp);
 	if (dispatch_policy == NETISR_DISPATCH_DEFERRED)
 		return (netisr_queue_src(proto, source, m));
 
 	/*
 	 * If direct dispatch is forced, then unconditionally dispatch
 	 * without a formal CPU selection.  Borrow the current CPU's stats,
 	 * even if there's no worker on it.  In this case we don't update
 	 * nws_flags because all netisr processing will be source ordered due
 	 * to always being forced to directly dispatch.
 	 */
 	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
 		nwsp = DPCPU_PTR(nws);
 		npwp = &nwsp->nws_work[proto];
 		npwp->nw_dispatched++;
 		npwp->nw_handled++;
-		netisr_proto[proto].np_handler(m);
+		V_netisr_proto[proto].np_handler(m);
 		error = 0;
 		goto out_unlock;
 	}
 
 	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
 	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));
 
 	/*
 	 * Otherwise, we execute in a hybrid mode where we will try to direct
 	 * dispatch if we're on the right CPU and the netisr worker isn't
 	 * already running.
 	 */
 	sched_pin();
-	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
+	m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_HYBRID,
 	    source, m, &cpuid);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto out_unpin;
 	}
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 	if (cpuid != curcpu)
 		goto queue_fallback;
 	nwsp = DPCPU_PTR(nws);
 	npwp = &nwsp->nws_work[proto];
 
 	/*-
 	 * We are willing to direct dispatch only if three conditions hold:
 	 *
 	 * (1) The netisr worker isn't already running,
 	 * (2) Another thread isn't already directly dispatching, and
 	 * (3) The netisr hasn't already been woken up.
 	 */
 	NWS_LOCK(nwsp);
 	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
 		error = netisr_queue_workstream(nwsp, proto, npwp, m,
 		    &dosignal);
 		NWS_UNLOCK(nwsp);
 		if (dosignal)
 			NWS_SIGNAL(nwsp);
 		goto out_unpin;
 	}
 
 	/*
 	 * The current thread is now effectively the netisr worker, so set
 	 * the dispatching flag to prevent concurrent processing of the
 	 * stream from another thread (even the netisr worker), which could
 	 * otherwise lead to effective misordering of the stream.
 	 */
 	nwsp->nws_flags |= NWS_DISPATCHING;
 	NWS_UNLOCK(nwsp);
-	netisr_proto[proto].np_handler(m);
+	V_netisr_proto[proto].np_handler(m);
 	NWS_LOCK(nwsp);
 	nwsp->nws_flags &= ~NWS_DISPATCHING;
 	npwp->nw_handled++;
 	npwp->nw_hybrid_dispatched++;
 
 	/*
 	 * If other work was enqueued by another thread while we were direct
 	 * dispatching, we need to signal the netisr worker to do that work.
 	 * In the future, we might want to do some of that work in the
 	 * current thread, rather than trigger further context switches.  If
 	 * so, we'll want to establish a reasonable bound on the work done in
 	 * the "borrowed" context.
 	 */
 	if (nwsp->nws_pendingbits != 0) {
 		nwsp->nws_flags |= NWS_SCHEDULED;
 		dosignal = 1;
 	} else
 		dosignal = 0;
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	error = 0;
 	goto out_unpin;
 
 queue_fallback:
 	error = netisr_queue_internal(proto, m, cpuid);
 out_unpin:
 	sched_unpin();
 out_unlock:
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_dispatch(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_dispatch_src(proto, 0, m));
 }
 
 #ifdef DEVICE_POLLING
 /*
  * Kernel polling borrows a netisr thread to run interface polling in; this
  * function allows kernel polling to request that the netisr thread be
  * scheduled even if no packets are pending for protocols.
  */
 void
 netisr_sched_poll(void)
 {
 	struct netisr_workstream *nwsp;
 
 	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
 	NWS_SIGNAL(nwsp);
 }
 #endif
 
 static void
 netisr_start_swi(u_int cpuid, struct pcpu *pc)
 {
 	char swiname[12];
 	struct netisr_workstream *nwsp;
 	int error;
 
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
 	nwsp->nws_cpu = cpuid;
 	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
 	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
 	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
 	if (error)
 		panic("%s: swi_add %d", __func__, error);
 	pc->pc_netisr = nwsp->nws_intr_event;
 	if (netisr_bindthreads) {
 		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
 		if (error != 0)
 			printf("%s: cpu %u: intr_event_bind: %d", __func__,
 			    cpuid, error);
 	}
 	NETISR_WLOCK();
 	nws_array[nws_count] = nwsp->nws_cpu;
 	nws_count++;
 	NETISR_WUNLOCK();
 }
 
 /*
  * Initialize the netisr subsystem.  We rely on BSS and static initialization
  * of most fields in global data structures.
  *
  * Start a worker thread for the boot CPU so that we can support network
  * traffic immediately in case the network stack is used before additional
  * CPUs are started (for example, diskless boot).
  */
 static void
 netisr_init(void *arg)
 {
 	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));
 
 	NETISR_LOCK_INIT();
 	if (netisr_maxthreads == 0 || netisr_maxthreads < -1 )
 		netisr_maxthreads = 1;		/* default behavior */
 	else if (netisr_maxthreads == -1)
 		netisr_maxthreads = mp_ncpus;	/* use max cpus */
 	if (netisr_maxthreads > mp_ncpus) {
 		printf("netisr_init: forcing maxthreads from %d to %d\n",
 		    netisr_maxthreads, mp_ncpus);
 		netisr_maxthreads = mp_ncpus;
 	}
 	if (netisr_defaultqlimit > netisr_maxqlimit) {
 		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
 		    netisr_defaultqlimit, netisr_maxqlimit);
 		netisr_defaultqlimit = netisr_maxqlimit;
 	}
 #ifdef DEVICE_POLLING
 	/*
 	 * The device polling code is not yet aware of how to deal with
 	 * multiple netisr threads, so for the time being compiling in device
 	 * polling disables parallel netisr workers.
 	 */
 	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
 		printf("netisr_init: forcing maxthreads to 1 and "
 		    "bindthreads to 0 for device polling\n");
 		netisr_maxthreads = 1;
 		netisr_bindthreads = 0;
 	}
 #endif
 	netisr_start_swi(curcpu, pcpu_find(curcpu));
 }
 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
 
 /*
  * Start worker threads for additional CPUs.  No attempt to gracefully handle
  * work reassignment, we don't yet support dynamic reconfiguration.
  */
 static void
 netisr_start(void *arg)
 {
 	struct pcpu *pc;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		/* Worker will already be present for boot CPU. */
 		if (pc->pc_netisr != NULL)
 			continue;
 		netisr_start_swi(pc->pc_cpuid, pc);
 	}
 }
 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
 
 /*
  * Sysctl monitoring for netisr: query a list of registered protocols.
  */
 static int
 sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_proto *snpp, *snp_array;
 	struct netisr_proto *npp;
 	u_int counter, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-		npp = &netisr_proto[proto];
+		npp = &V_netisr_proto[proto];
 		if (npp->np_name == NULL)
 			continue;
 		snpp = &snp_array[counter];
 		snpp->snp_version = sizeof(*snpp);
 		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
 		snpp->snp_proto = proto;
 		snpp->snp_qlimit = npp->np_qlimit;
 		snpp->snp_policy = npp->np_policy;
 		snpp->snp_dispatch = npp->np_dispatch;
 		if (npp->np_m2flow != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
 		if (npp->np_m2cpuid != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
 		if (npp->np_drainedcpu != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= NETISR_MAXPROT,
 	    ("sysctl_netisr_proto: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
 	free(snp_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, proto,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
     "S,sysctl_netisr_proto",
     "Return list of protocols registered with netisr");
 
 /*
  * Sysctl monitoring for netisr: query a list of workstreams.
  */
 static int
 sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_workstream *snwsp, *snws_array;
 	struct netisr_workstream *nwsp;
 	u_int counter, cpuid;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		snwsp = &snws_array[counter];
 		snwsp->snws_version = sizeof(*snwsp);
 
 		/*
 		 * For now, we equate workstream IDs and CPU IDs in the
 		 * kernel, but expose them independently to userspace in case
 		 * that assumption changes in the future.
 		 */
 		snwsp->snws_wsid = cpuid;
 		snwsp->snws_cpu = cpuid;
 		if (nwsp->nws_intr_event != NULL)
 			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
 		NWS_UNLOCK(nwsp);
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= MAXCPU,
 	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
 	free(snws_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
     "S,sysctl_netisr_workstream",
     "Return list of workstreams implemented by netisr");
 
 /*
  * Sysctl monitoring for netisr: query per-protocol data across all
  * workstreams.
  */
 static int
 sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_work *snwp, *snw_array;
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *nwp;
 	u_int counter, cpuid, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
 	    M_TEMP, M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-			npp = &netisr_proto[proto];
+			npp = &V_netisr_proto[proto];
 			if (npp->np_name == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			snwp = &snw_array[counter];
 			snwp->snw_version = sizeof(*snwp);
 			snwp->snw_wsid = cpuid;		/* See comment above. */
 			snwp->snw_proto = proto;
 			snwp->snw_len = nwp->nw_len;
 			snwp->snw_watermark = nwp->nw_watermark;
 			snwp->snw_dispatched = nwp->nw_dispatched;
 			snwp->snw_hybrid_dispatched =
 			    nwp->nw_hybrid_dispatched;
 			snwp->snw_qdrops = nwp->nw_qdrops;
 			snwp->snw_queued = nwp->nw_queued;
 			snwp->snw_handled = nwp->nw_handled;
 			counter++;
 		}
 		NWS_UNLOCK(nwsp);
 	}
 	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
 	    ("sysctl_netisr_work: counter too big (%d)", counter));
 	NETISR_RUNLOCK(&tracker);
 	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
 	free(snw_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, work,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
     "S,sysctl_netisr_work",
     "Return list of per-workstream, per-protocol work in netisr");
 
 #ifdef DDB
 DB_SHOW_COMMAND(netisr, db_show_netisr)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *nwp;
 	int first, proto;
 	u_int cpuid;
 
 	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
 	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		first = 1;
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-			if (netisr_proto[proto].np_handler == NULL)
+			if (V_netisr_proto[proto].np_handler == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			if (first) {
 				db_printf("%3d ", cpuid);
 				first = 0;
 			} else
 				db_printf("%3s ", "");
 			db_printf(
 			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
-			    netisr_proto[proto].np_name, nwp->nw_len,
+			    V_netisr_proto[proto].np_name, nwp->nw_len,
 			    nwp->nw_watermark, nwp->nw_qlimit,
 			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
 			    nwp->nw_qdrops, nwp->nw_queued);
 		}
 	}
 }
 #endif
Index: projects/vnet/sys/net/rtsock.c
===================================================================
--- projects/vnet/sys/net/rtsock.c	(revision 298547)
+++ projects/vnet/sys/net/rtsock.c	(revision 298548)
@@ -1,1925 +1,1942 @@
 /*-
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_compat.h"
 #include "opt_mpath.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/raw_cb.h>
 #include <net/route.h>
 #include <net/route_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	struct	if_data ifm_data;
 };
 
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	struct	if_data ifm_data;
 };
 
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 #endif /* COMPAT_FREEBSD32 */
 
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /* NB: these are not modified */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock/raw_input callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 
 typedef struct {
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 static VNET_DEFINE(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
 
 struct walkarg {
 	int	w_tmemsize;
 	int	w_op, w_arg;
 	caddr_t	w_tmem;
 	struct sysctl_req *w_req;
 };
 
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	sysctl_dumpentry(struct radix_node *rn, void *vw);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static int	route_output(struct mbuf *m, struct socket *so, ...);
 static void	rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static struct sockaddr	*rtsock_fix_netmask(struct sockaddr *dst,
 			struct sockaddr *smask, struct sockaddr_storage *dmask);
 
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
         if (error || !req->newptr)
                 return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 static void
 rts_init(void)
 {
 	int tmp;
 
 	if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
 		rtsock_nh.nh_qlimit = tmp;
-	netisr_register(&rtsock_nh);
 }
 SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
+
+static void
+vnet_rts_init(void)
+{
+
+	netisr_register(&rtsock_nh);
+}
+VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
+    vnet_rts_init, 0);
+
+static void
+vnet_rts_uninit(void)
+{
+
+	netisr_unregister(&rtsock_nh);
+}
+VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
+    vnet_rts_uninit, 0);
 
 static int
 raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src,
     struct rawcb *rp)
 {
 	int fibnum;
 
 	KASSERT(m != NULL, ("%s: m is NULL", __func__));
 	KASSERT(proto != NULL, ("%s: proto is NULL", __func__));
 	KASSERT(rp != NULL, ("%s: rp is NULL", __func__));
 
 	/* No filtering requested. */
 	if ((m->m_flags & RTS_FILTER_FIB) == 0)
 		return (0);
 
 	/* Check if it is a rts and the fib matches the one of the socket. */
 	fibnum = M_GETFIB(m);
 	if (proto->sp_family != PF_ROUTE ||
 	    rp->rcb_socket == NULL ||
 	    rp->rcb_socket->so_fibnum == fibnum)
 		return (0);
 
 	/* Filtering requested and no match, the socket shall be skipped. */
 	return (1);
 }
 
 static void
 rts_input(struct mbuf *m)
 {
 	struct sockproto route_proto;
 	unsigned short *family;
 	struct m_tag *tag;
 
 	route_proto.sp_family = PF_ROUTE;
 	tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
 	if (tag != NULL) {
 		family = (unsigned short *)(tag + 1);
 		route_proto.sp_protocol = *family;
 		m_tag_delete(m, tag);
 	} else
 		route_proto.sp_protocol = 0;
 
 	raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb);
 }
 
 /*
  * It really doesn't make any sense at all for this code to share much
  * with raw_usrreq.c, since its functionality is so restricted.  XXX
  */
 static void
 rts_abort(struct socket *so)
 {
 
 	raw_usrreqs.pru_abort(so);
 }
 
 static void
 rts_close(struct socket *so)
 {
 
 	raw_usrreqs.pru_close(so);
 }
 
 /* pru_accept is EOPNOTSUPP */
 
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rawcb *rp;
 	int error;
 
 	KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
 
 	/* XXX */
 	rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
 
 	so->so_pcb = (caddr_t)rp;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	error = raw_attach(so, proto);
 	rp = sotorawcb(so);
 	if (error) {
 		so->so_pcb = NULL;
 		free(rp, M_PCB);
 		return error;
 	}
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 	so->so_options |= SO_USELOOPBACK;
 	return 0;
 }
 
 static int
 rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
 }
 
 static int
 rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
 }
 
 /* pru_connect2 is EOPNOTSUPP */
 /* pru_control is EOPNOTSUPP */
 
 static void
 rts_detach(struct socket *so)
 {
 	struct rawcb *rp = sotorawcb(so);
 
 	KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
 
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count--;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count--;
 		break;
 	}
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 	raw_usrreqs.pru_detach(so);
 }
 
 static int
 rts_disconnect(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_disconnect(so));
 }
 
 /* pru_listen is EOPNOTSUPP */
 
 static int
 rts_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_peeraddr(so, nam));
 }
 
 /* pru_rcvd is EOPNOTSUPP */
 /* pru_rcvoob is EOPNOTSUPP */
 
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
 }
 
 /* pru_sense is null */
 
 static int
 rts_shutdown(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_shutdown(so));
 }
 
 static int
 rts_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_sockaddr(so, nam));
 }
 
 static struct pr_usrreqs route_usrreqs = {
 	.pru_abort =		rts_abort,
 	.pru_attach =		rts_attach,
 	.pru_bind =		rts_bind,
 	.pru_connect =		rts_connect,
 	.pru_detach =		rts_detach,
 	.pru_disconnect =	rts_disconnect,
 	.pru_peeraddr =		rts_peeraddr,
 	.pru_send =		rts_send,
 	.pru_shutdown =		rts_shutdown,
 	.pru_sockaddr =		rts_sockaddr,
 	.pru_close =		rts_close,
 };
 
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle.
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
 {
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct in6_addr ia6;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET6)
 				continue;
 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
 			    &ia6, sizeof(struct in6_addr));
 			if (prison_check_ip6(cred, &ia6) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
 			    sin6_addr;
 			if (prison_get_ip6(cred, &ia6) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
 		if (sa6_recoverscope(&saun->sin6) != 0)
 			return (ESRCH);
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 route_output(struct mbuf *m, struct socket *so, ...)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rtentry *rt = NULL;
 	struct rib_head *rnh;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	int i, rti_need_deembed = 0;
 #endif
 	int alloc_len = 0, len, error = 0, fibnum;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
 	sa_family_t saf = AF_UNSPEC;
 	struct rawcb *rp = NULL;
 	struct walkarg w;
 
 	fibnum = so->so_fibnum;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundaty.
 	 */
 	alloc_len = roundup2(len, 1024);
 	if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	bzero(&w, sizeof(w));
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	rtm->rtm_pid = curproc->p_pid;
 	info.rti_addrs = rtm->rtm_addrs;
 
 	info.rti_mflags = rtm->rtm_inits;
 	info.rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info))
 		senderr(EINVAL);
 
 	info.rti_flags = rtm->rtm_flags;
 	if (info.rti_info[RTAX_DST] == NULL ||
 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
 	     info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
 		senderr(EINVAL);
 	saf = info.rti_info[RTAX_DST]->sa_family;
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error)
 			senderr(error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the 
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
 	    info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct rt_addrinfo ginfo;
 		struct sockaddr *gdst;
 
 		bzero(&ginfo, sizeof(ginfo));
 		bzero(&ss, sizeof(ss));
 		ss.ss_len = sizeof(ss);
 
 		ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss;
 		gdst = info.rti_info[RTAX_GATEWAY];
 
 		/* 
 		 * A host route through the loopback interface is 
 		 * installed for each interface adddress. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as 
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the rt_gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) {
 			if (ss.ss_family == AF_LINK &&
 			    ginfo.rti_ifp->if_flags & IFF_LOOPBACK) {
 				info.rti_flags &= ~RTF_GATEWAY;
 				info.rti_flags |= RTF_GWFLAG_COMPAT;
 			}
 			rib_free_info(&ginfo);
 		}
 	}
 
 	switch (rtm->rtm_type) {
 		struct rtentry *saved_nrt;
 
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (info.rti_info[RTAX_GATEWAY] == NULL)
 			senderr(EINVAL);
 		saved_nrt = NULL;
 
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt,
 		    fibnum);
 		if (error == 0 && saved_nrt != NULL) {
 #ifdef INET6
 			rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			RT_LOCK(saved_nrt);
 			rtm->rtm_index = saved_nrt->rt_ifp->if_index;
 			RT_REMREF(saved_nrt);
 			RT_UNLOCK(saved_nrt);
 		}
 		break;
 
 	case RTM_DELETE:
 		saved_nrt = NULL;
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY] && 
 		    (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum);
 		if (error == 0) {
 			RT_LOCK(saved_nrt);
 			rt = saved_nrt;
 			goto report;
 		}
 #ifdef INET6
 		/* rt_msg2() will not be used when RTM_DELETE fails. */
 		rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 		break;
 
 	case RTM_GET:
 		rnh = rt_tables_get_rnh(fibnum, saf);
 		if (rnh == NULL)
 			senderr(EAFNOSUPPORT);
 
 		RIB_RLOCK(rnh);
 
 		if (info.rti_info[RTAX_NETMASK] == NULL &&
 		    rtm->rtm_type == RTM_GET) {
 			/*
 			 * Provide logest prefix match for
 			 * address lookup (no mask).
 			 * 'route -n get addr'
 			 */
 			rt = (struct rtentry *) rnh->rnh_matchaddr(
 			    info.rti_info[RTAX_DST], &rnh->head);
 		} else
 			rt = (struct rtentry *) rnh->rnh_lookup(
 			    info.rti_info[RTAX_DST],
 			    info.rti_info[RTAX_NETMASK], &rnh->head);
 
 		if (rt == NULL) {
 			RIB_RUNLOCK(rnh);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
 		/*
 		 * for RTM_CHANGE/LOCK, if we got multipath routes,
 		 * we require users to specify a matching RTAX_GATEWAY.
 		 *
 		 * for RTM_GET, gate is optional even with multipath.
 		 * if gate == NULL the first match is returned.
 		 * (no need to call rt_mpath_matchgate if gate == NULL)
 		 */
 		if (rt_mpath_capable(rnh) &&
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
 				RIB_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		}
 #endif
 		/*
 		 * If performing proxied L2 entry insertion, and
 		 * the actual PPP host entry is found, perform
 		 * another search to retrieve the prefix route of
 		 * the local end point of the PPP link.
 		 */
 		if (rtm->rtm_flags & RTF_ANNOUNCE) {
 			struct sockaddr laddr;
 
 			if (rt->rt_ifp != NULL && 
 			    rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
 				struct ifaddr *ifa;
 
 				ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1,
 						RT_ALL_FIBS);
 				if (ifa != NULL)
 					rt_maskedcopy(ifa->ifa_addr,
 						      &laddr,
 						      ifa->ifa_netmask);
 			} else
 				rt_maskedcopy(rt->rt_ifa->ifa_addr,
 					      &laddr,
 					      rt->rt_ifa->ifa_netmask);
 			/* 
 			 * refactor rt and no lock operation necessary
 			 */
 			rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr,
 			    &rnh->head);
 			if (rt == NULL) {
 				RIB_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		RIB_RUNLOCK(rnh);
 
 report:
 		RT_LOCK_ASSERT(rt);
 		if ((rt->rt_flags & RTF_HOST) == 0
 		    ? jailed_without_vnet(curthread->td_ucred)
 		    : prison_if(curthread->td_ucred,
 		    rt_key(rt)) != 0) {
 			RT_UNLOCK(rt);
 			senderr(ESRCH);
 		}
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 		    rt_mask(rt), &ss);
 		info.rti_info[RTAX_GENMASK] = 0;
 		if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 			ifp = rt->rt_ifp;
 			if (ifp) {
 				info.rti_info[RTAX_IFP] =
 				    ifp->if_addr->ifa_addr;
 				error = rtm_get_jailed(&info, ifp, rt,
 				    &saun, curthread->td_ucred);
 				if (error != 0) {
 					RT_UNLOCK(rt);
 					senderr(error);
 				}
 				if (ifp->if_flags & IFF_POINTOPOINT)
 					info.rti_info[RTAX_BRD] =
 					    rt->rt_ifa->ifa_dstaddr;
 				rtm->rtm_index = ifp->if_index;
 			} else {
 				info.rti_info[RTAX_IFP] = NULL;
 				info.rti_info[RTAX_IFA] = NULL;
 			}
 		} else if ((ifp = rt->rt_ifp) != NULL) {
 			rtm->rtm_index = ifp->if_index;
 		}
 
 		/* Check if we need to realloc storage */
 		rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len);
 		if (len > alloc_len) {
 			struct rt_msghdr *new_rtm;
 			new_rtm = malloc(len, M_TEMP, M_NOWAIT);
 			if (new_rtm == NULL) {
 				RT_UNLOCK(rt);
 				senderr(ENOBUFS);
 			}
 			bcopy(rtm, new_rtm, rtm->rtm_msglen);
 			free(rtm, M_TEMP);
 			rtm = new_rtm;
 			alloc_len = len;
 		}
 
 		w.w_tmem = (caddr_t)rtm;
 		w.w_tmemsize = alloc_len;
 		rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len);
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_addrs = info.rti_addrs;
 
 		RT_UNLOCK(rt);
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 flush:
 	if (rt != NULL)
 		RTFREE(rt);
 	/*
 	 * Check to see if we don't want our own messages.
 	 */
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		if (V_route_cb.any_count <= 1) {
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return (error);
 		}
 		/* There is another listener, so construct message */
 		rp = sotorawcb(so);
 	}
 
 	if (rtm != NULL) {
 #ifdef INET6
 		if (rti_need_deembed) {
 			/* sin6_scope_id is recovered before sending rtm. */
 			sin6 = (struct sockaddr_in6 *)&ss;
 			for (i = 0; i < RTAX_MAX; i++) {
 				if (info.rti_info[i] == NULL)
 					continue;
 				if (info.rti_info[i]->sa_family != AF_INET6)
 					continue;
 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					bcopy(sin6, info.rti_info[i],
 						    sizeof(*sin6));
 			}
 		}
 #endif
 		if (error != 0)
 			rtm->rtm_errno = error;
 		else
 			rtm->rtm_flags |= RTF_DONE;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 
 		free(rtm, M_TEMP);
 	}
 	if (m != NULL) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 		if (rp) {
 			/*
 			 * XXX insure we don't get a copy by
 			 * invalidating our protocol
 			 */
 			unsigned short family = rp->rcb_proto.sp_family;
 			rp->rcb_proto.sp_family = 0;
 			rt_dispatch(m, saf);
 			rp->rcb_proto.sp_family = family;
 		} else
 			rt_dispatch(m, saf);
 	}
 
 	return (error);
 }
 
 static void
 rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
 {
 
 	bzero(out, sizeof(*out));
 	out->rmx_mtu = rt->rt_mtu;
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
 	/* Kernel -> userland timebase conversion. */
 	out->rmx_expire = rt->rt_expire ?
 	    rt->rt_expire - time_uptime + time_second : 0;
 }
 
 /*
  * Extract the addresses of the passed sockaddrs.
  * Do a little sanity checking so as to avoid bad memory references.
  * This data is derived straight from userland.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr *sa;
 	int i;
 
 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
 			continue;
 		sa = (struct sockaddr *)cp;
 		/*
 		 * It won't fit.
 		 */
 		if (cp + sa->sa_len > cplim)
 			return (EINVAL);
 		/*
 		 * there are no more.. quit now
 		 * If there are more bits, they are in error.
 		 * I've seen this. route(1) can evidently generate these. 
 		 * This causes kernel to core dump.
 		 * for compatibility, If we see this, point to a safe address.
 		 */
 		if (sa->sa_len == 0) {
 			rtinfo->rti_info[i] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 		/* accept it */
 #ifdef INET6
 		if (sa->sa_family == AF_INET6)
 			sa6_embedscope((struct sockaddr_in6 *)sa,
 			    V_ip6_use_defzone);
 #endif
 		rtinfo->rti_info[i] = sa;
 		cp += SA_SIZE(sa);
 	}
 	return (0);
 }
 
 /*
  * Fill in @dmask with valid netmask leaving original @smask
  * intact. Mostly used with radix netmasks.
  */
 static struct sockaddr *
 rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 	if (dst == NULL || smask == NULL)
 		return (NULL);
 
 	memset(dmask, 0, dst->sa_len);
 	memcpy(dmask, smask, smask->sa_len);
 	dmask->ss_len = dst->sa_len;
 	dmask->ss_family = dst->sa_family;
 
 	return ((struct sockaddr *)dmask);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of vaious events (if/ifa announces,etc)
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	int i;
 	struct sockaddr *sa;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 	int len, dlen;
 
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 #ifdef INET6
 		if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)&ss;
 			bcopy(sa, sin6, sizeof(*sin6));
 			if (sa6_recoverscope(sin6) == 0)
 				sa = (struct sockaddr *)sin6;
 		}
 #endif
 		m_copyback(m, len, dlen, (caddr_t)sa);
 		len += dlen;
 	}
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * Returns 0 on success.
  *
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	int i;
 	int len, buflen = 0, dlen;
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32)
 				len = sizeof(struct ifa_msghdrl32);
 			else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		buflen = w->w_tmemsize - len;
 		cp = (caddr_t)w->w_tmem + len;
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 		if (cp != NULL && buflen >= dlen) {
 #ifdef INET6
 			if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)&ss;
 				bcopy(sa, sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					sa = (struct sockaddr *)sin6;
 			}
 #endif
 			bcopy((caddr_t)sa, cp, (unsigned)dlen);
 			cp += dlen;
 			buflen -= dlen;
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 
 		len += dlen;
 	}
 
 	if (cp != NULL) {
 		dlen = ALIGN(len) - len;
 		if (buflen < dlen)
 			cp = NULL;
 		else
 			buflen -= dlen;
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occured, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that the status of a network interface has changed.
  */
 void
 rt_ifmsg(struct ifnet *ifp)
 {
 	struct if_msghdr *ifm;
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return;
 	bzero((caddr_t)&info, sizeof(info));
 	m = rtsock_msg_mbuf(RTM_IFINFO, &info);
 	if (m == NULL)
 		return;
 	ifm = mtod(m, struct if_msghdr *);
 	ifm->ifm_index = ifp->if_index;
 	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &ifm->ifm_data);
 	ifm->ifm_addrs = 0;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  * Returns 0 on success.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal.
  * Please do not call directly, use rt_routemsg().
  * Note that @rt data MAY be inconsistent/invalid:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	struct mbuf *m;
 	struct rt_msghdr *rtm;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = sa = rt_key(rt);
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL)
 		return (ENOBUFS);
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_index = ifp->if_index;
 	rtm->rtm_flags |= rt->rt_flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberhips.  This is easier since
  * there is no route state to worry about.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m = NULL;
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct ifma_msghdr *ifmam;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 	m = rtsock_msg_mbuf(cmd, &info);
 	if (m == NULL)
 		return;
 	ifmam = mtod(m, struct ifma_msghdr *);
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	ifmam->ifmam_index = ifp->if_index;
 	ifmam->ifmam_addrs = info.rti_addrs;
 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
 }
 
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *ifan;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return NULL;
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m != NULL) {
 		ifan = mtod(m, struct if_announcemsghdr *);
 		ifan->ifan_index = ifp->if_index;
 		strlcpy(ifan->ifan_name, ifp->if_xname,
 			sizeof(ifan->ifan_name));
 		ifan->ifan_what = what;
 	}
 	return m;
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m != NULL) {
 		/*
 		 * Append the ieee80211 data.  Try to stick it in the
 		 * mbuf containing the ifannounce msg; otherwise allocate
 		 * a new mbuf and append.
 		 *
 		 * NB: we assume m is a single mbuf.
 		 */
 		if (data_len > M_TRAILINGSPACE(m)) {
 			struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 			if (n == NULL) {
 				m_freem(m);
 				return;
 			}
 			bcopy(data, mtod(n, void *), data_len);
 			n->m_len = data_len;
 			m->m_next = n;
 		} else if (data_len > 0) {
 			bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 			m->m_len += data_len;
 		}
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len += data_len;
 		mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 		rt_dispatch(m, AF_UNSPEC);
 	}
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * network interface arrival and departure.
  */
 void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (m != NULL)
 		rt_dispatch(m, AF_UNSPEC);
 }
 
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 	struct m_tag *tag;
 
 	/*
 	 * Preserve the family from the sockaddr, if any, in an m_tag for
 	 * use when injecting the mbuf into the routing socket buffer from
 	 * the netisr.
 	 */
 	if (saf != AF_UNSPEC) {
 		tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
 		    M_NOWAIT);
 		if (tag == NULL) {
 			m_freem(m);
 			return;
 		}
 		*(unsigned short *)(tag + 1) = saf;
 		m_tag_prepend(m, tag);
 	}
 #ifdef VIMAGE
 	if (V_loif)
 		m->m_pkthdr.rcvif = V_loif;
 	else {
 		m_freem(m);
 		return;
 	}
 #endif
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * This is used in dumping the kernel table via sysctl().
  */
 static int
 sysctl_dumpentry(struct radix_node *rn, void *vw)
 {
 	struct walkarg *w = vw;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int error = 0, size;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 
 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
 		return 0;
 	if ((rt->rt_flags & RTF_HOST) == 0
 	    ? jailed_without_vnet(w->w_req->td->td_ucred)
 	    : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
 		return (0);
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 	    rt_mask(rt), &ss);
 	info.rti_info[RTAX_GENMASK] = 0;
 	if (rt->rt_ifp) {
 		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
 	}
 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
 		return (error);
 	if (w->w_req && w->w_tmem) {
 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_index = rt->rt_ifp->if_index;
 		rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
 		rtm->rtm_addrs = info.rti_addrs;
 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
 		return (error);
 	}
 	return (error);
 }
 
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *ifam;
 	struct if_data *ifd;
 
 	ifam = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *ifam32;
 
 		ifam32 = (struct ifa_msghdrl32 *)ifam;
 		ifam32->ifam_addrs = info->rti_addrs;
 		ifam32->ifam_flags = ifa->ifa_flags;
 		ifam32->ifam_index = ifa->ifa_ifp->if_index;
 		ifam32->_ifam_spare1 = 0;
 		ifam32->ifam_len = sizeof(*ifam32);
 		ifam32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam32->ifam_data;
 	} else
 #endif
 	{
 		ifam->ifam_addrs = info->rti_addrs;
 		ifam->ifam_flags = ifa->ifa_flags;
 		ifam->ifam_index = ifa->ifa_ifp->if_index;
 		ifam->_ifam_spare1 = 0;
 		ifam->ifam_len = sizeof(*ifam);
 		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam->ifam_data;
 	}
 
 	bzero(ifd, sizeof(*ifd));
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *ifam;
 
 	ifam = (struct ifa_msghdr *)w->w_tmem;
 	ifam->ifam_addrs = info->rti_addrs;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_index = ifa->ifa_ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &info, w, len);
 			else
 				error = sysctl_iflist_ifm(ifp, &info, w, len);
 			if (error)
 				goto done;
 		}
 		while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	if (ifp != NULL)
 		IF_ADDR_RUNLOCK(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma;
 	struct	rt_addrinfo info;
 	int	len, error = 0;
 	struct ifaddr *ifa;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			info.rti_info[RTAX_GATEWAY] =
 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
 			    ifma->ifma_lladdr : NULL;
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				struct ifma_msghdr *ifmam;
 
 				ifmam = (struct ifma_msghdr *)w->w_tmem;
 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
 				ifmam->ifmam_flags = 0;
 				ifmam->ifmam_addrs = info.rti_addrs;
 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 				if (error) {
 					IF_ADDR_RUNLOCK(ifp);
 					goto done;
 				}
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct rib_head *rnh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	name ++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	if (name[1] == NET_RT_DUMP) {
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 	
 	/*
 	 * Allocate reply buffer in advance.
 	 * All rtsock messages has maximum length of u_short.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	switch (w.w_op) {
 
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rnh = rt_tables_get_rnh(fib, i);
 			if (rnh != NULL) {
 				RIB_RLOCK(rnh); 
 			    	error = rnh->rnh_walktree(&rnh->head,
 				    sysctl_dumpentry, &w);
 				RIB_RUNLOCK(rnh);
 			} else if (af != 0)
 				error = EAFNOSUPPORT;
 		}
 		break;
 
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
 static struct protosw routesw[] = {
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&routedomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_output =		route_output,
 	.pr_ctlinput =		raw_ctlinput,
 	.pr_init =		raw_init,
 	.pr_usrreqs =		&route_usrreqs
 }
 };
 
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		 "route",
 	.dom_protosw =		routesw,
 	.dom_protoswNPROTOSW =	&routesw[sizeof(routesw)/sizeof(routesw[0])]
 };
 
 VNET_DOMAIN_SET(route);
Index: projects/vnet/sys/netinet/if_ether.c
===================================================================
--- projects/vnet/sys/netinet/if_ether.c	(revision 298547)
+++ projects/vnet/sys/netinet/if_ether.c	(revision 298548)
@@ -1,1338 +1,1353 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #define SIN(s) ((const struct sockaddr_in *)(s))
 
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 /* Simple ARP state machine */
 enum arp_llinfo_state {
 	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
 	ARP_LLINFO_REACHABLE,	/* LLE is valid */
 	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
 	ARP_LLINFO_DELETED,	/* LLE is deleted */
 };
 
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
 
 /* timer values */
 static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 static VNET_DEFINE(int, arp_maxtries) = 5;
 static VNET_DEFINE(int, arp_proxyall) = 0;
 static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 static VNET_DEFINE(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, arp_maxhold) = 1;
 
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arpt_rexmit		VNET(arpt_rexmit)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 #define	ARP_LOG(pri, ...)	do {					\
 	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 
-static void	arp_init(void);
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
     struct ifnet *ifp, int bridged, struct llentry *la);
 static void arp_mark_lle_reachable(struct llentry *la);
 static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
 
 static eventhandler_tag iflladdr_tag;
 
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 	int r_skip_req;
 
 	if (lle->la_flags & LLE_STATIC) {
 		return;
 	}
 	LLE_WLOCK(lle);
 	if (callout_pending(&lle->lle_timer)) {
 		/*
 		 * Here we are a bit odd here in the treatment of 
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
 		 * it would have return 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by arpresolve() below.
 		 */
 		LLE_WUNLOCK(lle);
  		return;
  	}
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	switch (lle->ln_state) {
 	case ARP_LLINFO_REACHABLE:
 
 		/*
 		 * Expiration time is approaching.
 		 * Let's try to refresh entry if it is still
 		 * in use.
 		 *
 		 * Set r_skip_req to get feedback from
 		 * fast path. Change state and re-schedule
 		 * ourselves.
 		 */
 		LLE_REQ_LOCK(lle);
 		lle->r_skip_req = 1;
 		LLE_REQ_UNLOCK(lle);
 		lle->ln_state = ARP_LLINFO_VERIFY;
 		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 		LLE_WUNLOCK(lle);
 		CURVNET_RESTORE();
 		return;
 	case ARP_LLINFO_VERIFY:
 		LLE_REQ_LOCK(lle);
 		r_skip_req = lle->r_skip_req;
 		LLE_REQ_UNLOCK(lle);
 
 		if (r_skip_req == 0 && lle->la_preempt > 0) {
 			/* Entry was used, issue refresh request */
 			struct in_addr dst;
 			dst = lle->r_l3addr.addr4;
 			lle->la_preempt--;
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			arprequest(ifp, NULL, &dst, NULL);
 			CURVNET_RESTORE();
 			return;
 		}
 		/* Nothing happened. Reschedule if not too late */
 		if (lle->la_expire > time_uptime) {
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			CURVNET_RESTORE();
 			return;
 		}
 		break;
 	case ARP_LLINFO_INCOMPLETE:
 	case ARP_LLINFO_DELETED:
 		break;
 	}
 
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->lle_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		LLE_REMREF(lle);
 		lltable_unlink_entry(lle->lle_tbl, lle);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	size_t pkts_dropped = llentry_free(lle);
 
 	ARPSTAT_ADD(dropped, pkts_dropped);
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Stores link-layer header for @ifp in format suitable for if_output()
  * into buffer @buf. Resulting header length is stored in @bufsize.
  *
  * Returns 0 on success.
  */
 static int
 arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
     size_t *bufsize)
 {
 	struct if_encap_req ereq;
 	int error;
 
 	bzero(buf, *bufsize);
 	bzero(&ereq, sizeof(ereq));
 	ereq.buf = buf;
 	ereq.bufsize = *bufsize;
 	ereq.rtype = IFENCAP_LL;
 	ereq.family = AF_ARP;
 	ereq.lladdr = ar_tha(ah);
 	ereq.hdata = (u_char *)ah;
 	if (bcast)
 		ereq.flags = IFENCAP_FLAG_BROADCAST;
 	error = ifp->if_requestencap(ifp, &ereq);
 	if (error == 0)
 		*bufsize = ereq.bufsize;
 
 	return (error);
 }
 
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	struct route ro;
 	int error;
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			if (ifa->ifa_carp) {
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return;
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return;
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
 	M_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
 	if (error != 0 && error != EAFNOSUPPORT) {
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		return;
 	}
 
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txrequests);
 }
 
 
 /*
  * Resolve an IP address into an ethernet address - heavy version.
  * Used internally by arpresolve().
  * We have already checked than  we can't use existing lle without
  * modification so we have to acquire LLE_EXCLUSIVE lle lock.
  *
  * On success, desten and flags are filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 static int
 arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = NULL, *la_tmp;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
 	char *lladdr;
 	int ll_len;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if ((flags & LLE_CREATE) == 0) {
 		IF_AFDATA_RLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		IF_AFDATA_RUNLOCK(ifp);
 	}
 	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa(SIN(dst)->sin_addr), if_name(ifp));
 			m_freem(m);
 			return (EINVAL);
 		}
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		/* Prefer ANY existing lle over newly-created one */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (la_tmp != NULL) {
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 		}
 	}
 	if (la == NULL) {
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		if (flags & LLE_ADDRONLY) {
 			lladdr = la->ll_addr;
 			ll_len = ifp->if_addrlen;
 		} else {
 			lladdr = la->r_linkdata;
 			ll_len = la->r_hdrlen;
 		}
 		bcopy(lladdr, desten, ll_len);
 
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		if (pflags != NULL)
 			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
 		LLE_WUNLOCK(la);
 		return (0);
 	}
 
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		if (la->la_numheld >= V_arp_maxhold) {
 			if (la->la_hold != NULL) {
 				next = la->la_hold->m_nextpkt;
 				m_freem(la->la_hold);
 				la->la_hold = next;
 				la->la_numheld--;
 				ARPSTAT_INC(dropped);
 			}
 		}
 		if (la->la_hold != NULL) {
 			curr = la->la_hold;
 			while (curr->m_nextpkt != NULL)
 				curr = curr->m_nextpkt;
 			curr->m_nextpkt = m;
 		} else
 			la->la_hold = m;
 		la->la_numheld++;
 	}
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled;
 
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		LLE_WUNLOCK(la);
 		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		return (error);
 	}
 
 	LLE_WUNLOCK(la);
 	return (error);
 }
 
 /*
  * Resolve an IP address into an ethernet address.
  */
 int
 arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags)
 {
 	int error;
 
 	flags |= LLE_ADDRONLY;
 	error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags);
 	return (error);
 }
 
 
 /*
  * Lookups link header based on an IP address.
  * On input:
  *    ifp is the interface we use
  *    is_gw != 0 if @dst represents gateway to some destination
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is the storage to put LL header.
  *    flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
  *
  * On success, full/partial link header and flags are filled in and
  * the function returns 0.
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 int
 arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = 0;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_UNLOCKED, dst);
 	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
 		/* Entry found, let's copy lle info */
 		bcopy(la->r_linkdata, desten, la->r_hdrlen);
 		if (pflags != NULL)
 			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		IF_AFDATA_RUNLOCK(ifp);
 		return (0);
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 
 	return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
 	    desten, pflags));
 }
 
 /*
  * Common length and type checks are done here,
  * then the protocol-specific routine is called.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 	struct ifnet *ifp;
 	char *layer;
 	int hlen;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
 		    if_name(ifp));
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	/* Check if length is sufficient */
 	if (m->m_len <  arphdr_len(ar)) {
 		m = m_pullup(m, arphdr_len(ar));
 		if (m == NULL) {
 			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
 			    if_name(ifp));
 			return;
 		}
 		ar = mtod(m, struct arphdr *);
 	}
 
 	hlen = 0;
 	layer = "";
 	switch (ntohs(ar->ar_hrd)) {
 	case ARPHRD_ETHER:
 		hlen = ETHER_ADDR_LEN; /* RFC 826 */
 		layer = "ethernet";
 		break;
 	case ARPHRD_IEEE802:
 		hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */
 		layer = "fddi";
 		break;
 	case ARPHRD_ARCNET:
 		hlen = 1; /* RFC 1201, ARC_ADDR_LEN */
 		layer = "arcnet";
 		break;
 	case ARPHRD_INFINIBAND:
 		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */ 
 		layer = "infiniband";
 		break;
 	case ARPHRD_IEEE1394:
 		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
 		layer = "firewire";
 
 		/*
 		 * Restrict too long hardware addresses.
 		 * Currently we are capable of handling 20-byte
 		 * addresses ( sizeof(lle->ll_addr) )
 		 */
 		if (ar->ar_hln >= 20)
 			hlen = 16;
 		break;
 	default:
 		ARP_LOG(LOG_NOTICE,
 		    "packet with unknown hardware format 0x%02d received on "
 		    "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	if (hlen != 0 && hlen != ar->ar_hln) {
 		ARP_LOG(LOG_NOTICE,
 		    "packet with invalid %s address length %d received on %s\n",
 		    layer, ar->ar_hln, if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	ARPSTAT_INC(received);
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 static int allow_multicast = 0;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL, *la_tmp;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	struct sockaddr *dst;
 	struct nhop4_basic nh4;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	struct route ro;
 	size_t linkhdrsize;
 	int lladdr_off;
 	int error;
 
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	/*
 	 * We already have checked that mbuf contains enough contiguous data
 	 * to hold entire arp message according to the arp header.
 	 */
 	ah = mtod(m, struct arphdr *);
 
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 	}
 	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				ifp = ia->ia_ifp;
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
 		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		goto drop;
 	}
 	ifa_ref(&ia->ia_ifa);
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 match:
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa(isaddr));
 		goto drop;
 	}
 
 	if (ifp->if_addrlen != ah->ar_hln) {
 		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 		    "i/f %d (ignored)\n", ifp->if_addrlen,
 		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 		    ifp->if_addrlen);
 		goto drop;
 	}
 
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa(isaddr), ifp->if_xname);
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	dst = (struct sockaddr *)&sin;
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (la != NULL)
 		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 	else if (itaddr.s_addr == myaddr.s_addr) {
 		/*
 		 * Request/reply to our address, but no lle exists yet.
 		 * Calculate full link prepend to use in lle.
 		 */
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 		    &linkhdrsize, &lladdr_off) != 0)
 			goto reply;
 
 		/* Allocate new entry */
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 
 			/*
 			 * lle creation may fail if source address belongs
 			 * to non-directly connected subnet. However, we
 			 * will try to answer the request instead of dropping
 			 * frame.
 			 */
 			goto reply;
 		}
 		lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off);
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 
 		/*
 		 * Check if lle still does not exists.
 		 * If it does, that means that we either
 		 * 1) have configured it explicitly, via
 		 * 1a) 'arp -s' static entry or
 		 * 1b) interface address static record
 		 * or
 		 * 2) it was the result of sending first packet to-host
 		 * or
 		 * 3) it was another arp reply packet we handled in
 		 * different thread.
 		 *
 		 * In all cases except 3) we definitely need to prefer
 		 * existing lle. For the sake of simplicity, prefer any
 		 * existing lle over newly-create one.
 		 */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 
 		if (la_tmp == NULL) {
 			arp_mark_lle_reachable(la);
 			LLE_WUNLOCK(la);
 		} else {
 			/* Free newly-create entry and handle packet */
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 			la_tmp = NULL;
 			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 			/* arp_check_update_lle() returns @la unlocked */
 		}
 		la = NULL;
 	}
 reply:
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		struct llentry *lle = NULL;
 
 		sin.sin_addr = itaddr;
 		IF_AFDATA_RLOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			if (!V_arp_proxyall)
 				goto drop;
 
 			/* XXX MRT use table 0 for arp reply  */
 			if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
 			if (nh4.nh_ifp == ifp)
 				goto drop;
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
 
 			/* XXX MRT use table 0 for arp checks */
 			if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0)
 				goto drop;
 			if (nh4.nh_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
 				    " from %s via %s\n",
 				    inet_ntoa(isaddr), ifp->if_xname);
 				goto drop;
 			}
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa(itaddr));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
 
 	/*
 	 * arp_fillheader() may fail due to lack of support inside encap request
 	 * routing. This is not necessary an error, AF_ARP can/should be handled
 	 * by if_output().
 	 */
 	if (error != 0 && error != EAFNOSUPPORT) {
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		return;
 	}
 
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 /*
  * Checks received arp data against existing @la.
  * Updates lle state/performs notification if necessary.
  */
 static void
 arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
     int bridged, struct llentry *la)
 {
 	struct sockaddr sa;
 	struct mbuf *m_hold, *m_hold_next;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	LLE_WLOCK_ASSERT(la);
 
 	/* the following is not an error when doing bridging */
 	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 		if (log_arp_wrong_iface)
 			ARP_LOG(LOG_WARNING, "%s is on %s "
 			    "but got reply from %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    la->lle_tbl->llt_ifp->if_xname,
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		LLE_WUNLOCK(la);
 		return;
 	}
 	if ((la->la_flags & LLE_VALID) &&
 	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
 		if (la->la_flags & LLE_STATIC) {
 			LLE_WUNLOCK(la);
 			if (log_arp_permanent_modify)
 				ARP_LOG(LOG_ERR,
 				    "%*D attempts to modify "
 				    "permanent entry for %s on %s\n",
 				    ifp->if_addrlen,
 				    (u_char *)ar_sha(ah), ":",
 				    inet_ntoa(isaddr), ifp->if_xname);
 			return;
 		}
 		if (log_arp_movements) {
 			ARP_LOG(LOG_INFO, "%s moved from %*D "
 			    "to %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    ifp->if_addrlen,
 			    (u_char *)&la->ll_addr, ":",
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		}
 	}
 
 	/* Calculate full link prepend to use in lle */
 	linkhdrsize = sizeof(linkhdr);
 	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 	    &linkhdrsize, &lladdr_off) != 0)
 		return;
 
 	/* Check if something has changed */
 	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
 	    (la->la_flags & LLE_VALID) == 0) {
 		/* Try to perform LLE update */
 		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off) == 0)
 			return;
 
 		/* Clear fast path feedback request if set */
 		la->r_skip_req = 0;
 	}
 
 	arp_mark_lle_reachable(la);
 
 	/*
 	 * The packets are all freed within the call to the output
 	 * routine.
 	 *
 	 * NB: The lock MUST be released before the call to the
 	 * output routine.
 	 */
 	if (la->la_hold != NULL) {
 		m_hold = la->la_hold;
 		la->la_hold = NULL;
 		la->la_numheld = 0;
 		lltable_fill_sa_entry(la, &sa);
 		LLE_WUNLOCK(la);
 		for (; m_hold != NULL; m_hold = m_hold_next) {
 			m_hold_next = m_hold->m_nextpkt;
 			m_hold->m_nextpkt = NULL;
 			/* Avoid confusing lower layers. */
 			m_clrprotoflags(m_hold);
 			(*ifp->if_output)(ifp, m_hold, &sa, NULL);
 		}
 	} else
 		LLE_WUNLOCK(la);
 }
 
 static void
 arp_mark_lle_reachable(struct llentry *la)
 {
 	int canceled, wtime;
 
 	LLE_WLOCK_ASSERT(la);
 
 	la->ln_state = ARP_LLINFO_REACHABLE;
 	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 	if (!(la->la_flags & LLE_STATIC)) {
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime + V_arpt_keep;
 		wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
 		if (wtime < 0)
 			wtime = V_arpt_keep;
 		canceled = callout_reset(&la->lle_timer,
 		    hz * wtime, arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 	}
 	la->la_asked = 0;
 	la->la_preempt = V_arp_maxtries;
 }
 
 /*
  * Add pernament link-layer record for given interface address.
  */
 static __noinline void
 arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
 {
 	struct llentry *lle, *lle_tmp;
 
 	/*
 	 * Interface address LLE record is considered static
 	 * because kernel code relies on LLE_STATIC flag to check
 	 * if these entries can be rewriten by arp updates.
 	 */
 	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
 	if (lle == NULL) {
 		log(LOG_INFO, "arp_ifinit: cannot create arp "
 		    "entry for interface address\n");
 		return;
 	}
 
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 	/* Unlink any entry if exists */
 	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (lle_tmp != NULL)
 		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
 
 	lltable_link_entry(LLTABLE(ifp), lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	if (lle_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
 
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 	LLE_WUNLOCK(lle);
 	if (lle_tmp != NULL)
 		lltable_free_entry(LLTABLE(ifp), lle_tmp);
 }
 
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	const struct sockaddr_in *dst_in;
 	const struct sockaddr *dst;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	dst = ifa->ifa_addr;
 	dst_in = (const struct sockaddr_in *)dst;
 
 	if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
 		return;
 	arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
 
 	arp_add_ifa_lle(ifp, dst);
 }
 
 void
 arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
 {
 
 	if (ntohl(addr.s_addr) != INADDR_ANY)
 		arprequest(ifp, &addr, &addr, enaddr);
 }
 
 /*
  * Sends gratuitous ARPs for each ifaddr to notify other
  * nodes about the address change.
  */
 static __noinline void
 arp_handle_ifllchange(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 	}
 }
 
 /*
  * A handler for interface link layer address change event.
  */
 static void
 arp_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 
 	lltable_update_ifaddr(LLTABLE(ifp));
 
 	if ((ifp->if_flags & IFF_UP) != 0)
 		arp_handle_ifllchange(ifp);
 }
 
 static void
-arp_init(void)
+vnet_arp_init(void)
 {
 
 	netisr_register(&arp_nh);
 	if (IS_DEFAULT_VNET(curvnet))
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 }
-SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, arp_init, 0);
+VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
+    vnet_arp_init, 0);
+
+#ifdef VIMAGE
+/*
+ * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH
+ * lookups after destroying the hash.  Ideally this would go on SI_ORDER_3.5.
+ */
+static void
+vnet_arp_destroy(__unused void *arg)
+{
+
+	netisr_unregister(&arp_nh);
+}
+VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
+    vnet_arp_destroy, NULL);
+#endif
Index: projects/vnet/sys/netinet/igmp.c
===================================================================
--- projects/vnet/sys/netinet/igmp.c	(revision 298547)
+++ projects/vnet/sys/netinet/igmp.c	(revision 298548)
@@ -1,3667 +1,3676 @@
 /*-
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 /*
  * Internet Group Management Protocol (IGMP) routines.
  * [RFC1112, RFC2236, RFC3376]
  *
  * Written by Steve Deering, Stanford, May 1988.
  * Modified by Rosen Sharma, Stanford, Aug 1994.
  * Modified by Bill Fenner, Xerox PARC, Feb 1995.
  * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
  * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
  *
  * MULTICAST Revision: 3.5.1.4
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/condvar.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 static struct igmp_ifsoftc *
 		igi_alloc_locked(struct ifnet *);
 static void	igi_delete_locked(const struct ifnet *);
 static void	igmp_dispatch_queue(struct mbufq *, int, const int);
 static void	igmp_fasttimo_vnet(void);
 static void	igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_handle_state_change(struct in_multi *,
 		    struct igmp_ifsoftc *);
 static int	igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
 		    /*const*/ struct igmpv3 *);
 static int	igmp_input_v3_group_query(struct in_multi *,
 		    struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *);
 static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static void	igmp_intr(struct mbuf *);
 static int	igmp_isgroupreported(const struct in_addr);
 static struct mbuf *
 		igmp_ra_alloc(void);
 #ifdef KTR
 static char *	igmp_rec_type_to_str(const int);
 #endif
 static void	igmp_set_version(struct igmp_ifsoftc *, const int);
 static void	igmp_slowtimo_vnet(void);
 static int	igmp_v1v2_queue_report(struct in_multi *, const int);
 static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
 static void	igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *);
 static void	igmp_v2_update_group(struct in_multi *, const int);
 static void	igmp_v3_cancel_link_timers(struct igmp_ifsoftc *);
 static void	igmp_v3_dispatch_general_query(struct igmp_ifsoftc *);
 static struct mbuf *
 		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
 static int	igmp_v3_enqueue_group_record(struct mbufq *,
 		    struct in_multi *, const int, const int, const int);
 static int	igmp_v3_enqueue_filter_change(struct mbufq *,
 		    struct in_multi *);
 static void	igmp_v3_process_group_timers(struct igmp_ifsoftc *,
 		    struct mbufq *, struct mbufq *, struct in_multi *,
 		    const int);
 static int	igmp_v3_merge_state_changes(struct in_multi *,
 		    struct mbufq *);
 static void	igmp_v3_suppress_group_record(struct in_multi *);
 static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
 
 static const struct netisr_handler igmp_nh = {
 	.nh_name = "igmp",
 	.nh_handler = igmp_intr,
 	.nh_proto = NETISR_IGMP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * System-wide globals.
  *
  * Unlocked access to these is OK, except for the global IGMP output
  * queue. The IGMP subsystem lock ends up being system-wide for the moment,
  * because all VIMAGEs have to share a global output queue, as netisrs
  * themselves are not virtualized.
  *
  * Locking:
  *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * All output is delegated to the netisr.
  *    Now that Giant has been eliminated, the netisr may be inlined.
  *  * IN_MULTI_LOCK covers in_multi.
  *  * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file,
  *    including the output queue.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *  * igmp_ifsoftc is valid as long as PF_INET is attached to the interface,
  *    therefore it is not refcounted.
  *    We allow unlocked reads of igmp_ifsoftc when accessed via in_multi.
  *
  * Reference counting
  *  * IGMP acquires its own reference every time an in_multi is passed to
  *    it and the group is being joined for the first time.
  *  * IGMP releases its reference(s) on in_multi in a deferred way,
  *    because the operations which process the release run as part of
  *    a loop whose control variables are directly affected by the release
  *    (that, and not recursing on the IF_ADDR_LOCK).
  *
  * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
  * to a vnet in ifp->if_vnet.
  *
  * SMPng: XXX We may potentially race operations on ifma_protospec.
  * The problem is that we currently lack a clean way of taking the
  * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
  * as anything which modifies ifma needs to be covered by that lock.
  * So check for ifma_protospec being NULL before proceeding.
  */
 struct mtx		 igmp_mtx;
 
 struct mbuf		*m_raopt;		 /* Router Alert option */
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
 
 /*
  * VIMAGE-wide globals.
  *
  * The IGMPv3 timers themselves need to run per-image, however,
  * protosw timers run globally (see tcp).
  * An ifnet can only be in one vimage at a time, and the loopback
  * ifnet, loif, is itself virtualized.
  * It would otherwise be possible to seriously hose IGMP state,
  * and create inconsistencies in upstream multicast routing, if you have
  * multiple VIMAGEs running on the same link joining different multicast
  * groups, UNLESS the "primary IP address" is different. This is because
  * IGMP for IPv4 does not force link-local addresses to be used for each
  * node, unlike MLD for IPv6.
  * Obviously the IGMPv3 per-interface state has per-vimage granularity
  * also as a result.
  *
  * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
  * policy to control the address used by IGMP on the link.
  */
 static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
 							 * query response */
 static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
 							 * retransmit */
 static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
 							 * report; IGMPv3 g/sg
 							 * query response */
 
 #define	V_interface_timers_running	VNET(interface_timers_running)
 #define	V_state_change_timers_running	VNET(state_change_timers_running)
 #define	V_current_state_timers_running	VNET(current_state_timers_running)
 
 static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) =
     LIST_HEAD_INITIALIZER(igi_head);
 static VNET_DEFINE(struct igmpstat, igmpstat) = {
 	.igps_version = IGPS_VERSION_3,
 	.igps_len = sizeof(struct igmpstat),
 };
 static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
 
 #define	V_igi_head			VNET(igi_head)
 #define	V_igmpstat			VNET(igmpstat)
 #define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
 
 static VNET_DEFINE(int, igmp_recvifkludge) = 1;
 static VNET_DEFINE(int, igmp_sendra) = 1;
 static VNET_DEFINE(int, igmp_sendlocal) = 1;
 static VNET_DEFINE(int, igmp_v1enable) = 1;
 static VNET_DEFINE(int, igmp_v2enable) = 1;
 static VNET_DEFINE(int, igmp_legacysupp);
 static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
 
 #define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
 #define	V_igmp_sendra			VNET(igmp_sendra)
 #define	V_igmp_sendlocal		VNET(igmp_sendlocal)
 #define	V_igmp_v1enable			VNET(igmp_v1enable)
 #define	V_igmp_v2enable			VNET(igmp_v2enable)
 #define	V_igmp_legacysupp		VNET(igmp_legacysupp)
 #define	V_igmp_default_version		VNET(igmp_default_version)
 
 /*
  * Virtualized sysctls.
  */
 SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmpstat), igmpstat, "");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_recvifkludge), 0,
     "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendra), 0,
     "Send IP Router Alert option in IGMPv2/v3 messages");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendlocal), 0,
     "Send IGMP membership reports for 224.0.0.0/24 groups");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v1enable), 0,
     "Enable backwards compatibility with IGMPv1");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v2enable), 0,
     "Enable backwards compatibility with IGMPv2");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_legacysupp), 0,
     "Allow v1/v2 reports to suppress v3 group responses");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
     "Default version of IGMP to run on each interface");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
     "Rate limit for IGMPv3 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
     "Per-interface IGMPv3 state");
 
 static __inline void
 igmp_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 static __inline void
 igmp_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 #ifdef KTR
 static __inline char *
 inet_ntoa_haddr(in_addr_t haddr)
 {
 	struct in_addr ia;
 
 	ia.s_addr = htonl(haddr);
 	return (inet_ntoa(ia));
 }
 #endif
 
 /*
  * Restore context from a queued IGMP output chain.
  * Return saved ifindex.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  */
 static __inline uint32_t
 igmp_restore_context(struct mbuf *m)
 {
 
 #ifdef notyet
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set default IGMP version.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
 {
 	int	 error;
 	int	 new;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	new = V_igmp_default_version;
 
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
 	     V_igmp_default_version, new);
 
 	V_igmp_default_version = new;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	i = V_igmp_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
 	     V_igmp_gsrdelay.tv_sec, i);
 	V_igmp_gsrdelay.tv_sec = i;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct igmp_ifsoftc to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
 	if (error)
 		return (error);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	if (name[0] <= 0 || name[0] > V_if_index) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	error = ENOENT;
 
 	ifp = ifnet_byindex(name[0]);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		if (ifp == igi->igi_ifp) {
 			struct igmp_ifinfo info;
 
 			info.igi_version = igi->igi_version;
 			info.igi_v1_timer = igi->igi_v1_timer;
 			info.igi_v2_timer = igi->igi_v2_timer;
 			info.igi_v3_timer = igi->igi_v3_timer;
 			info.igi_flags = igi->igi_flags;
 			info.igi_rv = igi->igi_rv;
 			info.igi_qi = igi->igi_qi;
 			info.igi_qri = igi->igi_qri;
 			info.igi_uri = igi->igi_uri;
 			error = SYSCTL_OUT(req, &info, sizeof(info));
 			break;
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains
  * using the netisr.
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop)
 {
 	struct mbuf *m;
 
 	while ((m = mbufq_dequeue(mq)) != NULL) {
 		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m);
 		if (loop)
 			m->m_flags |= M_IGMP_LOOP;
 		netisr_dispatch(NETISR_IGMP, m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing IGMP report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
  * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
  * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
  * this may break certain IGMP snooping switches which rely on the old
  * report behaviour.
  *
  * Return zero if the given group is one for which IGMP reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 igmp_isgroupreported(const struct in_addr addr)
 {
 
 	if (in_allhosts(addr) ||
 	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Construct a Router Alert option to use in outgoing packets.
  */
 static struct mbuf *
 igmp_ra_alloc(void)
 {
 	struct mbuf	*m;
 	struct ipoption	*p;
 
 	m = m_get(M_WAITOK, MT_DATA);
 	p = mtod(m, struct ipoption *);
 	p->ipopt_dst.s_addr = INADDR_ANY;
 	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
 	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
 	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
 	p->ipopt_list[3] = 0x00;	/* pad byte */
 	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
 
 	return (m);
 }
 
 /*
  * Attach IGMP when PF_INET is attached to an interface.
  */
 struct igmp_ifsoftc *
 igmp_domifattach(struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = igi_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		igi->igi_flags |= IGIF_SILENT;
 
 	IGMP_UNLOCK();
 
 	return (igi);
 }
 
 /*
  * VIMAGE: assume curvnet set by caller.
  */
 static struct igmp_ifsoftc *
 igi_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi;
 
 	IGMP_LOCK_ASSERT();
 
 	igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO);
 	if (igi == NULL)
 		goto out;
 
 	igi->igi_ifp = ifp;
 	igi->igi_version = V_igmp_default_version;
 	igi->igi_flags = 0;
 	igi->igi_rv = IGMP_RV_INIT;
 	igi->igi_qi = IGMP_QI_INIT;
 	igi->igi_qri = IGMP_QRI_INIT;
 	igi->igi_uri = IGMP_URI_INIT;
 	SLIST_INIT(&igi->igi_relinmhead);
 	mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
 
 	CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)",
 	     ifp, ifp->if_xname);
 
 out:
 	return (igi);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  *
  * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
  * XXX This is also bitten by unlocked ifma_protospec access.
  */
 void
 igmp_ifdetach(struct ifnet *ifp)
 {
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
 	    ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	if (igi->igi_version == IGMP_VERSION_3) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 #if 0
 			KASSERT(ifma->ifma_protospec != NULL,
 			    ("%s: ifma_protospec is NULL", __func__));
 #endif
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 			inm_clear_recorded(inm);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/*
 		 * Free the in_multi reference(s) for this IGMP lifecycle.
 		 */
 		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
 		    tinm) {
 			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 			inm_release_locked(inm);
 		}
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  */
 void
 igmp_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 	igi_delete_locked(ifp);
 	IGMP_UNLOCK();
 }
 
 static void
 igi_delete_locked(const struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 
 	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
 		if (igi->igi_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			mbufq_drain(&igi->igi_gq);
 
 			LIST_REMOVE(igi, igi_link);
 
 			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
 			    ("%s: there are dangling in_multi references",
 			    __func__));
 
 			free(igi, M_IGMP);
 			return;
 		}
 	}
 }
 
 /*
  * Process a received IGMPv1 query.
  * Return non-zero if the message should be dropped.
  *
  * VIMAGE: The curvnet pointer is derived from the input ifp.
  */
 static int
 igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 
 	/*
 	 * IGMPv1 Host Mmembership Queries SHOULD always be addressed to
 	 * 224.0.0.1. They are always treated as General Queries.
 	 * igmp_group is always ignored. Do not drop it as a userland
 	 * daemon may wish to see it.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
 		IGMPSTAT_INC(igps_rcv_badqueries);
 		return (0);
 	}
 	IGMPSTAT_INC(igps_rcv_gen_queries);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Switch to IGMPv1 host compatibility mode.
 	 */
 	igmp_set_version(igi, IGMP_VERSION_1);
 
 	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Start the timers in all of our group records
 	 * for the interface on which the query arrived,
 	 * except those which are already running.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_timer != 0)
 			continue;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(
 			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 			V_current_state_timers_running = 1;
 			break;
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 general or group-specific query.
  */
 static int
 igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 
 	is_general_query = 0;
 
 	/*
 	 * Validate address fields upfront.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (in_nullhost(igmp->igmp_group)) {
 		/*
 		 * IGMPv2 General Query.
 		 * If this was not sent to the all-hosts group, ignore it.
 		 */
 		if (!in_allhosts(ip->ip_dst))
 			return (0);
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		is_general_query = 1;
 	} else {
 		/* IGMPv2 Group-Specific Query. */
 		IGMPSTAT_INC(igps_rcv_group_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Ignore v2 query if in v1 Compatibility Mode.
 	 */
 	if (igi->igi_version == IGMP_VERSION_1)
 		goto out_locked;
 
 	igmp_set_version(igi, IGMP_VERSION_2);
 
 	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			igmp_v2_update_group(inm, timer);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	} else {
 		/*
 		 * Group-specific IGMPv2 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = inm_lookup(ifp, igmp->igmp_group);
 		if (inm != NULL) {
 			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			igmp_v2_update_group(inm, timer);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an IGMPv2 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group, and timer is
  * below the threshold, reset it.
  *
  * We may be updating the group for the first time since we switched
  * to IGMPv3. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses. 
  *
  * Unlike IGMPv3, the delay per group should be jittered
  * to avoid bursts of IGMPv2 reports.
  */
 static void
 igmp_v2_update_group(struct in_multi *inm, const int timer)
 {
 
 	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
 	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
 
 	IN_MULTI_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (inm->inm_timer != 0 &&
 		    inm->inm_timer <= timer) {
 			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		break;
 	case IGMP_SLEEPING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
 		inm->inm_state = IGMP_AWAKENING_MEMBER;
 		break;
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received IGMPv3 general, group-specific or
  * group-and-source-specific query.
  * Assumes m has already been pulled up to the full IGMP message length.
  * Return 0 if successful, otherwise an appropriate error code is returned.
  */
 static int
 igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
     /*const*/ struct igmpv3 *igmpv3)
 {
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint32_t		 maxresp, nsrc, qqi;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 
 	is_general_query = 0;
 
 	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
 	if (maxresp >= 128) {
 		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
 			  (IGMP_EXP(igmpv3->igmp_code) + 3);
 	}
 
 	/*
 	 * Robustness must never be less than 2 for on-wire IGMPv3.
 	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
 	 * an exception for interfaces whose IGMPv3 state changes
 	 * are redirected to loopback (e.g. MANET).
 	 */
 	qrv = IGMP_QRV(igmpv3->igmp_misc);
 	if (qrv < 2) {
 		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
 		    qrv, IGMP_RV_INIT);
 		qrv = IGMP_RV_INIT;
 	}
 
 	qqi = igmpv3->igmp_qqi;
 	if (qqi >= 128) {
 		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
 		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
 	}
 
 	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Validate address fields and versions upfront before
 	 * accepting v3 query.
 	 * XXX SMPng: Unlocked access to igmpstat counters here.
 	 */
 	if (in_nullhost(igmpv3->igmp_group)) {
 		/*
 		 * IGMPv3 General Query.
 		 *
 		 * General Queries SHOULD be directed to 224.0.0.1.
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
 			IGMPSTAT_INC(igps_rcv_badqueries);
 			return (0);
 		}
 		is_general_query = 1;
 	} else {
 		/* Group or group-source specific query. */
 		if (nsrc == 0)
 			IGMPSTAT_INC(igps_rcv_group_queries);
 		else
 			IGMPSTAT_INC(igps_rcv_gsr_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Discard the v3 query if we're in Compatibility Mode.
 	 * The RFC is not obviously worded that hosts need to stay in
 	 * compatibility mode until the Old Version Querier Present
 	 * timer expires.
 	 */
 	if (igi->igi_version != IGMP_VERSION_3) {
 		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
 		    igi->igi_version, ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	igmp_set_version(igi, IGMP_VERSION_3);
 	igi->igi_rv = qrv;
 	igi->igi_qi = qqi;
 	igi->igi_qri = maxresp;
 
 	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
 	    maxresp);
 
 	if (is_general_query) {
 		/*
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
 			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
 			V_interface_timers_running = 1;
 		}
 	} else {
 		/*
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = inm_lookup(ifp, igmpv3->igmp_group);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->inm_lastgsrtv,
 			    &V_igmp_gsrdelay)) {
 				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
 				    __func__);
 				IGMPSTAT_INC(igps_drop_gsr_queries);
 				goto out_locked;
 			}
 		}
 		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
 		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
 			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a recieved IGMPv3 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occured. Currently this is ignored.
  */
 static int
 igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi,
     int timer, /*const*/ struct igmpv3 *igmpv3)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 			inm_clear_recorded(inm);
 			timer = min(inm->inm_timer, timer);
 		}
 		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->inm_timer, timer);
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 * FIXME: Handling source lists larger than 1 mbuf requires that
 	 * we pass the mbuf chain pointer down to this function, and use
 	 * m_getptr() to walk the chain.
 	 */
 	if (inm->inm_nsrc > 0) {
 		const struct in_addr	*ap;
 		int			 i, nrecorded;
 
 		ap = (const struct in_addr *)(igmpv3 + 1);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++, ap++) {
 			retval = inm_record_source(inm, ap->s_addr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_IGMPV3,
 			    "%s: schedule response to SG query", __func__);
 			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 			V_current_state_timers_running = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received IGMPv1 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia != NULL) {
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, stop our group timer and transition to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifsoftc *igi;
 
 		igi = inm->inm_igi;
 		if (igi == NULL) {
 			KASSERT(igi != NULL,
 			    ("%s: no igi for ifp %p", __func__, ifp));
 			goto out_locked;
 		}
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_SLEEPING_MEMBER:
 			inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_REPORTING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			if (igi->igi_version == IGMP_VERSION_1)
 				inm->inm_state = IGMP_LAZY_MEMBER;
 			else if (igi->igi_version == IGMP_VERSION_2)
 				inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	/*
 	 * Make sure we don't hear our own membership report.  Fast
 	 * leave requires knowing that we are the only member of a
 	 * group.
 	 */
 	IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
 		ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		if (ia != NULL)
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv2 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifsoftc *igi;
 
 		igi = inm->inm_igi;
 		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_LAZY_MEMBER:
 			inm->inm_state = IGMP_LAZY_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 int
 igmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	int iphlen;
 	struct ifnet *ifp;
 	struct igmp *igmp;
 	struct ip *ip;
 	struct mbuf *m;
 	int igmplen;
 	int minlen;
 	int queryver;
 
 	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
 
 	m = *mp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 
 	IGMPSTAT_INC(igps_rcv_total);
 
 	ip = mtod(m, struct ip *);
 	iphlen = *offp;
 	igmplen = ntohs(ip->ip_len) - iphlen;
 
 	/*
 	 * Validate lengths.
 	 */
 	if (igmplen < IGMP_MINLEN) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Always pullup to the minimum size for v1/v2 or v3
 	 * to amortize calls to m_pullup().
 	 */
 	minlen = iphlen;
 	if (igmplen >= IGMP_V3_QUERY_MINLEN)
 		minlen += IGMP_V3_QUERY_MINLEN;
 	else
 		minlen += IGMP_MINLEN;
 	if ((!M_WRITABLE(m) || m->m_len < minlen) &&
 	    (m = m_pullup(m, minlen)) == 0) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Validate checksum.
 	 */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	igmp = mtod(m, struct igmp *);
 	if (in_cksum(m, igmplen)) {
 		IGMPSTAT_INC(igps_rcv_badsum);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	/*
 	 * IGMP control traffic is link-scope, and must have a TTL of 1.
 	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
 	 * probe packets may come from beyond the LAN.
 	 */
 	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
 		IGMPSTAT_INC(igps_rcv_badttl);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	switch (igmp->igmp_type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
 		if (igmplen == IGMP_MINLEN) {
 			if (igmp->igmp_code == 0)
 				queryver = IGMP_VERSION_1;
 			else
 				queryver = IGMP_VERSION_2;
 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
 			queryver = IGMP_VERSION_3;
 		} else {
 			IGMPSTAT_INC(igps_rcv_tooshort);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		switch (queryver) {
 		case IGMP_VERSION_1:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v1enable)
 				break;
 			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_2:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v2enable)
 				break;
 			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_3: {
 				struct igmpv3 *igmpv3;
 				uint16_t igmpv3len;
 				uint16_t nsrc;
 
 				IGMPSTAT_INC(igps_rcv_v3_queries);
 				igmpv3 = (struct igmpv3 *)igmp;
 				/*
 				 * Validate length based on source count.
 				 */
 				nsrc = ntohs(igmpv3->igmp_numsrc);
 				if (nsrc * sizeof(in_addr_t) >
 				    UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				/*
 				 * m_pullup() may modify m, so pullup in
 				 * this scope.
 				 */
 				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
 				   sizeof(struct in_addr) * nsrc;
 				if ((!M_WRITABLE(m) ||
 				     m->m_len < igmpv3len) &&
 				    (m = m_pullup(m, igmpv3len)) == NULL) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
 				    + iphlen);
 				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 			}
 			break;
 		}
 		break;
 
 	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v1enable)
 			break;
 		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v2enable)
 			break;
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
 		/*
 		 * Hosts do not need to process IGMPv3 membership reports,
 		 * as report suppression is no longer required.
 		 */
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Pass all valid IGMP packets up to any process(es) listening on a
 	 * raw IGMP socket.
 	 */
 	*mp = m;
 	return (rip_input(mp, offp, proto));
 }
 
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  * Sends are shuffled off to a netisr to deal with Giant.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 igmp_fasttimo_vnet(void)
 {
 	struct mbufq		 scq;	/* State-change packets */
 	struct mbufq		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	int			 loop, uri_fasthz;
 
 	loop = 0;
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running &&
 	    !V_interface_timers_running &&
 	    !V_state_change_timers_running)
 		return;
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	/*
 	 * IGMPv3 General Query response timer processing.
 	 */
 	if (V_interface_timers_running) {
 		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
 
 		V_interface_timers_running = 0;
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_v3_timer == 0) {
 				/* Do nothing. */
 			} else if (--igi->igi_v3_timer == 0) {
 				igmp_v3_dispatch_general_query(igi);
 			} else {
 				V_interface_timers_running = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running &&
 	    !V_state_change_timers_running)
 		goto out_locked;
 
 	V_current_state_timers_running = 0;
 	V_state_change_timers_running = 0;
 
 	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
 
 	/*
 	 * IGMPv1/v2/v3 host report and state-change timer processing.
 	 * Note: Processing a v3 group timer may remove a node.
 	 */
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		ifp = igi->igi_ifp;
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
 			    PR_FASTHZ);
 			mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS);
 			mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			switch (igi->igi_version) {
 			case IGMP_VERSION_1:
 			case IGMP_VERSION_2:
 				igmp_v1v2_process_group_timer(inm,
 				    igi->igi_version);
 				break;
 			case IGMP_VERSION_3:
 				igmp_v3_process_group_timers(igi, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			struct in_multi		*tinm;
 
 			igmp_dispatch_queue(&qrq, 0, loop);
 			igmp_dispatch_queue(&scq, 0, loop);
 
 			/*
 			 * Free the in_multi reference(s) for this
 			 * IGMP lifecycle.
 			 */
 			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
 			    inm_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
 				    inm_nrele);
 				inm_release_locked(inm);
 			}
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Update host report group timer for IGMPv1/v2.
  * Will update the global pending timer flags.
  */
 static void
 igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
 {
 	int report_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	if (inm->inm_timer == 0) {
 		report_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		report_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 		return;
 	}
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (report_timer_expired) {
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			(void)igmp_v1v2_queue_report(inm,
 			    (version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 		}
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Update a group's timers for IGMPv3.
  * Will update the global pending timer flags.
  * Note: Unlocked read from igi.
  */
 static void
 igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
     struct mbufq *qrq, struct mbufq *scq,
     struct in_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from v1/v2 compatibility mode back to v3,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->inm_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 	}
 
 	if (inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			/* XXX Clear recorded sources for next time. */
 			inm_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->inm_scrv > 0) {
 				inm->inm_sctimer = uri_fasthz;
 				V_state_change_timers_running = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)igmp_v3_merge_state_changes(inm, scq);
 
 			inm_commit(inm);
 			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release IGMP's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
 			    inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 		}
 		break;
 	}
 }
 
 
 /*
  * Suppress a group's pending response to a group or source/group query.
  *
  * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
  * Do NOT update ST1/ST0 as this operation merely suppresses
  * the currently pending group record.
  * Do NOT suppress the response to a general query. It is possible but
  * it would require adding another state or flag.
  */
 static void
 igmp_v3_suppress_group_record(struct in_multi *inm)
 {
 
 	IN_MULTI_LOCK_ASSERT();
 
 	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
 		("%s: not IGMPv3 mode on link", __func__));
 
 	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER ||
 	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
 		return;
 
 	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 		inm_clear_recorded(inm);
 
 	inm->inm_timer = 0;
 	inm->inm_state = IGMP_REPORTING_MEMBER;
 }
 
 /*
  * Switch to a different IGMP version on the given interface,
  * as per Section 7.2.1.
  */
 static void
 igmp_set_version(struct igmp_ifsoftc *igi, const int version)
 {
 	int old_version_timer;
 
 	IGMP_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
 		/*
 		 * Compute the "Older Version Querier Present" timer as per
 		 * Section 8.12.
 		 */
 		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
 		old_version_timer *= PR_SLOWHZ;
 
 		if (version == IGMP_VERSION_1) {
 			igi->igi_v1_timer = old_version_timer;
 			igi->igi_v2_timer = 0;
 		} else if (version == IGMP_VERSION_2) {
 			igi->igi_v1_timer = 0;
 			igi->igi_v2_timer = old_version_timer;
 		}
 	}
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_2) {
 			igi->igi_version = IGMP_VERSION_2;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_1) {
 			igi->igi_version = IGMP_VERSION_1;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	}
 }
 
 /*
  * Cancel pending IGMPv3 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Only ever called on a transition from v3 to Compatibility mode. Kill
  * the timers stone dead (this may be expensive for large N groups), they
  * will be restarted if Compatibility Mode deems that they must be due to
  * query processing.
  */
 static void
 igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
 	    igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * Stop the v3 General Query Response on this link stone dead.
 	 * If fasttimo is woken up due to V_interface_timers_running,
 	 * the flag will be cleared if there are no pending link timers.
 	 */
 	igi->igi_v3_timer = 0;
 
 	/*
 	 * Now clear the current-state and state-change report timers
 	 * for all memberships scoped to this link.
 	 */
 	ifp = igi->igi_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * These states are either not relevant in v3 mode,
 			 * or are unreported. Do nothing.
 			 */
 			break;
 		case IGMP_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching to
 			 * compatibility mode, we need to release the final
 			 * reference held for issuing the INCLUDE {}, and
 			 * transition to REPORTING to ensure the host leave
 			 * message is sent upstream to the old querier --
 			 * transition to NOT would lose the leave and race.
 			 */
 			SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
 			/* FALLTHROUGH */
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 			inm_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case IGMP_REPORTING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			break;
 		}
 		/*
 		 * Always clear state-change and group report timers.
 		 * Free any pending IGMPv3 state-change records.
 		 */
 		inm->inm_sctimer = 0;
 		inm->inm_timer = 0;
 		mbufq_drain(&inm->inm_scq);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 		inm_release_locked(inm);
 	}
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 7.2.1 of RFC 3376.
  */
 static void
 igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi)
 {
 
 	IGMP_LOCK_ASSERT();
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
 		/*
 		 * IGMPv1 and IGMPv2 Querier Present timers expired.
 		 *
 		 * Revert to IGMPv3.
 		 */
 		if (igi->igi_version != IGMP_VERSION_3) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_version = IGMP_VERSION_3;
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer expired,
 		 * IGMPv2 Querier Present timer running.
 		 * If IGMPv2 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv2 is enabled, revert to IGMPv2.
 		 */
 		if (!V_igmp_v2enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v2_timer;
 			if (igi->igi_version != IGMP_VERSION_2) {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_2,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_version = IGMP_VERSION_2;
 				igmp_v3_cancel_link_timers(igi);
 			}
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer running.
 		 * Stop IGMPv2 timer if running.
 		 *
 		 * If IGMPv1 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv1 is enabled, reset IGMPv2 timer if running.
 		 */
 		if (!V_igmp_v1enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v1_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v1_timer;
 		}
 		if (igi->igi_v2_timer > 0) {
 			CTR3(KTR_IGMPV3,
 			    "%s: cancel v2 timer on %p(%s)",
 			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 		}
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 igmp_slowtimo_vnet(void)
 {
 	struct igmp_ifsoftc *igi;
 
 	IGMP_LOCK();
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		igmp_v1v2_process_querier_timers(igi);
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Dispatch an IGMPv1/v2 host report or leave message.
  * These are always small enough to fit inside a single mbuf.
  */
 static int
 igmp_v1v2_queue_report(struct in_multi *inm, const int type)
 {
 	struct ifnet		*ifp;
 	struct igmp		*igmp;
 	struct ip		*ip;
 	struct mbuf		*m;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	ifp = inm->inm_ifp;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOMEM);
 	M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
 
 	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len = sizeof(struct igmp);
 
 	igmp = mtod(m, struct igmp *);
 	igmp->igmp_type = type;
 	igmp->igmp_code = 0;
 	igmp->igmp_group = inm->inm_addr;
 	igmp->igmp_cksum = 0;
 	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
 	ip->ip_off = 0;
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
 	else
 		ip->ip_dst = inm->inm_addr;
 
 	igmp_save_context(m, ifp);
 
 	m->m_flags |= M_IGMPV2;
 	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
 		m->m_flags |= M_IGMP_LOOP;
 
 	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
 	netisr_dispatch(NETISR_IGMP, m);
 
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv4 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to.the group
  * state, it is now up to IGMP to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the IGMPv3 state machine at group level. The IGMP module
  * however makes the decision as to which IGMP protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
  * save ourselves a bunch of work; any exclusive mode groups need not
  * compute source filter lists.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 igmp_change_state(struct in_multi *inm)
 {
 	struct igmp_ifsoftc *igi;
 	struct ifnet *ifp;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to detect if the upper layer just asked us to change state
 	 * for an interface which has now gone away.
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	/*
 	 * Sanity check that netinet's notion of ifp is the
 	 * same as net's.
 	 */
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	/*
 	 * If we detect a state transition to or from MCAST_UNDEFINED
 	 * for this group, then we are starting or finishing an IGMP
 	 * life cycle for this group.
 	 */
 	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
 		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
 		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
 		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
 			error = igmp_initial_join(inm, igi);
 			goto out_locked;
 		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
 			igmp_final_leave(inm, igi);
 			goto out_locked;
 		}
 	} else {
 		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
 	}
 
 	error = igmp_handle_state_change(inm, igi);
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an IGMP group.
  *
  * When joining a group:
  *  If the group should have its IGMP traffic suppressed, do nothing.
  *  IGMPv1 starts sending IGMPv1 host membership reports.
  *  IGMPv2 starts sending IGMPv2 host membership reports.
  *  IGMPv3 will schedule an IGMPv3 state-change report containing the
  *  initial state of the membership.
  */
 static int
 igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	struct mbufq		*mq;
 	int			 error, retval, syncstates;
 
 	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
 	 * are never reported in any IGMP protocol exchanges.
 	 * All other groups enter the appropriate IGMP state machine
 	 * for the version in use on this link.
 	 * A link marked as IGIF_SILENT causes IGMP to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		inm->inm_state = IGMP_SILENT_MEMBER;
 		inm->inm_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3 &&
 		    inm->inm_state == IGMP_LEAVING_MEMBER)
 			inm_release_locked(inm);
 
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 
 		switch (igi->igi_version) {
 		case IGMP_VERSION_1:
 		case IGMP_VERSION_2:
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			error = igmp_v1v2_queue_report(inm,
 			    (igi->igi_version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 			if (error == 0) {
 				inm->inm_timer = IGMP_RANDOM_DELAY(
 				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 				V_current_state_timers_running = 1;
 			}
 			break;
 
 		case IGMP_VERSION_3:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			mq = &inm->inm_scq;
 			mbufq_drain(mq);
 			retval = igmp_v3_enqueue_group_record(mq, inm, 1,
 			    0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next igmp_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 */
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				KASSERT(igi->igi_rv > 1,
 				   ("%s: invalid robustness %d", __func__,
 				    igi->igi_rv));
 				inm->inm_scrv = igi->igi_rv;
 			}
 			inm->inm_sctimer = 1;
 			V_state_change_timers_running = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the IGMP life-cycle.
  */
 static int
 igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	int			 retval;
 
 	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr) ||
 	    (igi->igi_version != IGMP_VERSION_3)) {
 		if (!igmp_isgroupreported(inm->inm_addr)) {
 			CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	mbufq_drain(&inm->inm_scq);
 
 	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
 	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
 	if (retval <= 0)
 		return (-retval);
 
 	/*
 	 * If record(s) were enqueued, start the state-change
 	 * report timer for this group.
 	 */
 	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
 	inm->inm_sctimer = 1;
 	V_state_change_timers_running = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for an IGMP group.
  *
  * When leaving a group:
  *  IGMPv1 does nothing.
  *  IGMPv2 sends a host leave message, if and only if we are the reporter.
  *  IGMPv3 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	int syncstates;
 
 	syncstates = 1;
 
 	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		if (igi->igi_version == IGMP_VERSION_2) {
 #ifdef INVARIANTS
 			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 			panic("%s: IGMPv3 state reached, not IGMPv3 mode",
 			     __func__);
 #endif
 			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
 			inm->inm_state = IGMP_NOT_MEMBER;
 		} else if (igi->igi_version == IGMP_VERSION_3) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			mbufq_drain(&inm->inm_scq);
 			inm->inm_timer = 0;
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				inm->inm_scrv = igi->igi_rv;
 			}
 			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    inet_ntoa(inm->inm_addr),
 			    inm->inm_ifp->if_xname, inm->inm_scrv);
 			if (inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				inm->inm_sctimer = 0;
 			} else {
 				int retval;
 
 				inm_acquire_locked(inm);
 
 				retval = igmp_v3_enqueue_group_record(
 				    &inm->inm_scq, inm, 1, 0, 0);
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->inm_state = IGMP_LEAVING_MEMBER;
 				inm->inm_sctimer = 1;
 				V_state_change_timers_running = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 }
 
 /*
  * Enqueue an IGMPv3 group record to the given output queue.
  *
  * XXX This function could do with having the allocation code
  * split out, and the multiple-tree-walks coalesced into a single
  * routine as has been done in igmp_v3_enqueue_filter_change().
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IP/IGMP header to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query)
 {
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ifnet		*ifp;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 error, is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	in_addr_t		 naddr;
 	uint8_t			 mode;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 	ifp = inm->inm_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pig = NULL;
 	type = IGMP_DO_NOTHING;
 	mode = inm->inm_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
 	    inm->inm_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 */
 		if (mode != inm->inm_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_INCLUDE_MODE;
 				if (mode == MCAST_UNDEFINED)
 					record_has_sources = 0;
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = IGMP_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = IGMP_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = IGMP_MODE_IS_INCLUDE;
 			KASSERT(inm->inm_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->inm_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (igmp_v3_enqueue_filter_change(mq, inm));
 
 	if (type == IGMP_DO_NOTHING) {
 		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr),
 		    inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct igmp_grouprec);
 	if (record_has_sources)
 		minrec0len += sizeof(in_addr_t);
 
 	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
 	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
 	    inm->inm_ifp->if_xname);
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = mbufq_last(mq);
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		m = m0;
 		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
 	} else {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		if (!is_state_change && !is_group_query) {
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (m)
 				m->m_data += IGMP_LEADINGSPACE;
 		}
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 
 		igmp_save_context(m, ifp);
 
 		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	ig.ig_type = type;
 	ig.ig_datalen = 0;
 	ig.ig_numsrc = 0;
 	ig.ig_group = inm->inm_addr;
 	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct igmp_grouprec);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, do not
 	 * include source entries.
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			md = m_last(m);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			nbytes += sizeof(in_addr_t);
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		mbufq_enqueue(mq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m)
 			m->m_data += IGMP_LEADINGSPACE;
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 		igmp_save_context(m, ifp);
 		md = m_getptr(m, 0, &off);
 		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct igmp_grouprec);
 
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 
 		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
 		mbufq_enqueue(mq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
 	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
 	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
 	REC_FULL = REC_ALLOW | REC_BLOCK
 } rectype_t;
 
 /*
  * Enqueue an IGMPv3 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm)
 {
 	static const int MINRECLEN =
 	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
 	struct ifnet		*ifp;
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	in_addr_t		 naddr;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	if (inm->inm_nsrc == 0 ||
 	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->inm_ifp;			/* interface */
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	npbytes = 0;	/* # of bytes appended this packet */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			m0 = mbufq_last(mq);
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     IGMP_V3_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				CTR1(KTR_IGMPV3,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m)
 					m->m_data += IGMP_LEADINGSPACE;
 				if (m == NULL) {
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 					if (m)
 						M_ALIGN(m, IGMP_LEADINGSPACE);
 				}
 				if (m == NULL) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				igmp_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 				    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				npbytes = 0;
 				CTR1(KTR_IGMPV3,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the IGMP group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&ig, 0, sizeof(ig));
 			ig.ig_group = inm->inm_addr;
 			if (!m_append(m, sizeof(ig), (void *)&ig)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct igmp_grouprec);
 			if (m != m0) {
 				/* new packet; offset in c hain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct igmp_grouprec), &off);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct igmp_grouprec));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL)
 				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
 			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 				CTR2(KTR_IGMPV3, "%s: visit node %s",
 				    __func__, inet_ntoa_haddr(ims->ims_haddr));
 				now = ims_get_mode(inm, ims, 1);
 				then = ims_get_mode(inm, ims, 0);
 				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				naddr = htonl(ims->ims_haddr);
 				if (!m_append(m, sizeof(in_addr_t),
 				    (void *)&naddr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_IGMPV3,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct igmp_grouprec);
 				if (m != m0) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_adj(m, -ig)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct igmp_grouprec)));
 				}
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(in_addr_t));
 			if (crt == REC_ALLOW)
 				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
 			pig->ig_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				mbufq_enqueue(mq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 static int
 igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq)
 {
 	struct mbufq	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->inm_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->inm_scq;
 #ifdef KTR
 	if (mbufq_first(gq) == NULL) {
 		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = mbufq_first(gq);
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an IGMPv3 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatentation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = mbufq_last(scq);
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    IGMP_V3_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
 				domerge = 1;
 		}
 
 		if (!domerge && mbufq_full(gq)) {
 			CTR2(KTR_IGMPV3,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
 			m0 = mbufq_dequeue(gq);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)",
 			    __func__, m0, scq);
 			mbufq_enqueue(scq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending IGMPv3 General Query.
  */
 static void
 igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	int			 retval, loop;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi->igi_version == IGMP_VERSION_3,
 	    ("%s: called when version %d", __func__, igi->igi_version));
 
 	/*
 	 * Check that there are some packets queued. If so, send them first.
 	 * For large number of groups the reply to general query can take
 	 * many packets, we should finish sending them before starting of
 	 * queuing the new reply.
 	 */
 	if (mbufq_len(&igi->igi_gq) != 0)
 		goto send;
 
 	ifp = igi->igi_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->inm_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
 			    inm, 0, 0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 send:
 	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (mbufq_first(&igi->igi_gq) != NULL) {
 		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
 		    IGMP_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running = 1;
 	}
 }
 
 /*
  * Transmit the next pending IGMP message in the output queue.
  *
  * We get called from netisr_processqueue(). A mutex private to igmpoq
  * will be acquired and released around this routine.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as IGMP traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 igmp_intr(struct mbuf *m)
 {
 	struct ip_moptions	 imo;
 	struct ifnet		*ifp;
 	struct mbuf		*ipopts, *m0;
 	int			 error;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
 	ifindex = igmp_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IPSTAT_INC(ips_noroute);
 		goto out;
 	}
 
 	ipopts = V_igmp_sendra ? m_raopt : NULL;
 
 	imo.imo_multicast_ttl  = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
 
 	/*
 	 * If the user requested that IGMP traffic be explicitly
 	 * redirected to the loopback interface (e.g. they are running a
 	 * MANET interface and the routing protocol needs to see the
 	 * updates), handle this now.
 	 */
 	if (m->m_flags & M_IGMP_LOOP)
 		imo.imo_multicast_ifp = V_loif;
 	else
 		imo.imo_multicast_ifp = ifp;
 
 	if (m->m_flags & M_IGMPV2) {
 		m0 = m;
 	} else {
 		m0 = igmp_v3_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
 			m_freem(m);
 			IPSTAT_INC(ips_odropped);
 			goto out;
 		}
 	}
 
 	igmp_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 #ifdef MAC
 	mac_netinet_igmp_send(ifp, m0);
 #endif
 	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
 	if (error) {
 		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 
 	IGMPSTAT_INC(igps_snd_reports);
 
 out:
 	/*
 	 * We must restore the existing vnet pointer before
 	 * continuing as we are run from netisr context.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Encapsulate an IGMPv3 report.
  *
  * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
  * chain has already had its IP/IGMPv3 header prepended. In this case
  * the function will not attempt to prepend; the lengths and checksums
  * will however be re-computed.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct rm_priotracker	in_ifa_tracker;
 	struct igmp_report	*igmp;
 	struct ip		*ip;
 	int			 hdrlen, igmpreclen;
 
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	igmpreclen = m_length(m, NULL);
 	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
 
 	if (m->m_flags & M_IGMPV3_HDR) {
 		igmpreclen -= hdrlen;
 	} else {
 		M_PREPEND(m, hdrlen, M_NOWAIT);
 		if (m == NULL)
 			return (NULL);
 		m->m_flags |= M_IGMPV3_HDR;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len -= sizeof(struct ip);
 
 	igmp = mtod(m, struct igmp_report *);
 	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
 	igmp->ir_rsv1 = 0;
 	igmp->ir_rsv2 = 0;
 	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	igmp->ir_cksum = 0;
 	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
 	ip->ip_len = htons(hdrlen + igmpreclen);
 	ip->ip_off = htons(IP_DF);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_sum = 0;
 
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (m->m_flags & M_IGMP_LOOP) {
 		struct in_ifaddr *ia;
 
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia != NULL) {
 			ip->ip_src = ia->ia_addr.sin_addr;
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
 
 	return (m);
 }
 
 #ifdef KTR
 static char *
 igmp_rec_type_to_str(const int type)
 {
 
 	switch (type) {
 		case IGMP_CHANGE_TO_EXCLUDE_MODE:
 			return "TO_EX";
 			break;
 		case IGMP_CHANGE_TO_INCLUDE_MODE:
 			return "TO_IN";
 			break;
 		case IGMP_MODE_IS_EXCLUDE:
 			return "MODE_EX";
 			break;
 		case IGMP_MODE_IS_INCLUDE:
 			return "MODE_IN";
 			break;
 		case IGMP_ALLOW_NEW_SOURCES:
 			return "ALLOW_NEW";
 			break;
 		case IGMP_BLOCK_OLD_SOURCES:
 			return "BLOCK_OLD";
 			break;
 		default:
 			break;
 	}
 	return "unknown";
 }
 #endif
 
+static void
+vnet_igmp_init(const void *unused __unused)
+{
+
+	netisr_register(&igmp_nh);
+}
+VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY,
+    vnet_igmp_init, NULL);
+
 #ifdef VIMAGE
 static void
 vnet_igmp_uninit(const void *unused __unused)
 {
 
 	/* This can happen when we shutdown the entire network stack. */
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
+
+	netisr_unregister(&igmp_nh);
 }
 VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY,
     vnet_igmp_uninit, NULL);
 #endif
 
 #ifdef DDB
 DB_SHOW_COMMAND(igi_list, db_show_igi_list)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 	LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head;
 
         if (!have_addr) {
                 db_printf("usage: show igi_list <addr>\n");
                 return;
         }
         igi_head = (struct _igi_list *)addr;
 
 	LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) {
 		db_printf("igmp_ifsoftc %p:\n", igi);
 		db_printf("    ifp %p\n", igi->igi_ifp);
 		db_printf("    version %u\n", igi->igi_version);
 		db_printf("    v1_timer %u\n", igi->igi_v1_timer);
 		db_printf("    v2_timer %u\n", igi->igi_v2_timer);
 		db_printf("    v3_timer %u\n", igi->igi_v3_timer);
 		db_printf("    flags %#x\n", igi->igi_flags);
 		db_printf("    rv %u\n", igi->igi_rv);
 		db_printf("    qi %u\n", igi->igi_qi);
 		db_printf("    qri %u\n", igi->igi_qri);
 		db_printf("    uri %u\n", igi->igi_uri);
 		/* SLIST_HEAD(,in_multi)   igi_relinmhead */
 		/* struct mbufq    igi_gq; */
 		db_printf("\n");
 	}
 }
 #endif
 
 static int
 igmp_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 		IGMP_LOCK_INIT();
 		m_raopt = igmp_ra_alloc();
-		netisr_register(&igmp_nh);
 		break;
 	case MOD_UNLOAD:
 		CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
-		netisr_unregister(&igmp_nh);
 		m_free(m_raopt);
 		m_raopt = NULL;
 		IGMP_LOCK_DESTROY();
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t igmp_mod = {
     "igmp",
     igmp_modevent,
     0
 };
-DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY);
+DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
Index: projects/vnet/sys/netinet/ip_input.c
===================================================================
--- projects/vnet/sys/netinet/ip_input.c	(revision 298547)
+++ projects/vnet/sys/netinet/ip_input.c	(revision 298548)
@@ -1,1357 +1,1363 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 #include <netinet/in_rss.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 /* IP reassembly functions are defined in ip_reass.c. */
 extern void ipreass_init(void);
 extern void ipreass_drain(void);
 extern void ipreass_slowtimo(void);
 #ifdef VIMAGE
 extern void ipreass_destroy(void);
 #endif
 
 struct rmlock in_ifaddr_lock;
 RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 static VNET_DEFINE(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * It should likely have something that assert it
  * actually has valid flow details.
  */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
     "Maximum size of the IP direct input queue");
 
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_direct_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_direct_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
  */
 void
 ip_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	TAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	ipreass_init();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
-		return;
+		goto out;
 
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip_protox[i] = pr - inetsw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip_protox[pr->pr_protocol] = pr - inetsw;
 		}
 
+out:
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 
 #ifdef VIMAGE
 static void
 ip_destroy(void *unused __unused)
 {
 	struct ifnet *ifp;
 	int error;
+
+#ifdef	RSS
+	netisr_unregister(&ip_direct_nh);
+#endif
+	netisr_unregister(&ip_nh);
 
 	if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Remove the IPv4 addresses from all interfaces. */
 	in_ifscrub_all();
 
 	/* Make sure the IPv4 routes are gone as well. */
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		rt_flushifroutes_af(ifp, AF_INET);
 	IFNET_RUNLOCK();
 
 	/* Destroy IP reassembly queue. */
 	ipreass_destroy();
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL);
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *ip;
 	int hlen;
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	IPSTAT_INC(ips_delivered);
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/* Try to forward the packet, but if we fail continue */
 #ifdef IPSEC
 	/* For now we do not handle IPSEC in tryforward. */
 	if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) &&
 	    (V_ipforwarding == 1))
 		if (ip_tryforward(m) == NULL)
 			return;
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip_ipsec_filtertunnel(m))
 		goto passin;
 #else
 	if (V_ipforwarding == 1)
 		if (ip_tryforward(m) == NULL)
 			return;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
 		if (dchg != 0) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
          * matter if it is destined to another node, or whether it is 
          * a multicast one, RSVP wants it! and prevents it from being forwarded
          * anywhere else. Also checks if the rsvp daemon is running before
 	 * grabbing the packet.
          */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #ifdef IPSEC
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if (ip_ipsec_input(m, ip->ip_p) != 0)
 		goto bad;
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_slowtimo();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_drain();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * The protocol to be inserted into ip_protox[] must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol == ipproto) {
 			ip_protox[pr->pr_protocol] = pr - inetsw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip_protox[ipproto] = pr - inetsw;
 	return (0);
 }
 
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct sockaddr_in *sin;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #ifdef IPSEC
 	if (ip_ipsec_fwd(m) != 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #endif /* IPSEC */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
 			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
 			    0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
 	}
 #endif
 
 	bzero(&ro, sizeof(ro));
 	sin = (struct sockaddr_in *)&ro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 #ifdef RADIX_MPATH
 	rtalloc_mpath_fib(&ro,
 	    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 	    M_GETFIB(m));
 #else
 	in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 #endif
 	if (ro.ro_rt != NULL) {
 		ia = ifatoia(ro.ro_rt->rt_ifa);
 		ifa_ref(&ia->ia_ifa);
 	} else
 		ia = NULL;
 #ifndef IPSEC
 	/*
 	 * 'ia' may be NULL if there is no route for this destination.
 	 * In case of IPsec, Don't discard it just yet, but pass it to
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		RO_RTFREE(&ro);
 		return;
 	}
 #endif
 
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copy() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct rtentry *rt;
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 	}
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		}
 	}
 	if (mcopy == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 
 #ifdef IPSEC
 		/* 
 		 * If IPsec is configured for this path,
 		 * override any possibly mtu value set by ip_output.
 		 */ 
 		mtu = ip_ipsec_mtu(mcopy, mtu);
 #endif /* IPSEC */
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 
 	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
 		struct bintime bt;
 
 		bintime(&bt);
 		if (inp->inp_socket->so_options & SO_BINTIME) {
 			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 			struct timeval tv;
 
 			bintime2timeval(&bt, &tv);
 			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 static VNET_DEFINE(int, ip_rsvp_on);
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_RSVP)
 		return EOPNOTSUPP;
 
 	if (V_ip_rsvpd != NULL)
 		return EADDRINUSE;
 
 	V_ip_rsvpd = so;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!V_ip_rsvp_on) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return 0;
 }
 
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (V_ip_rsvp_on) {
 		V_ip_rsvp_on = 0;
 		V_rsvp_on--;
 	}
 	return 0;
 }
 
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 
 	if (rsvp_input_p) { /* call the real one if loaded */
 		*mp = m;
 		rsvp_input_p(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 
 	/* Can still get packets with rsvp_on = 0 if there is a local member
 	 * of the group to which the RSVP packet is addressed.  But in this
 	 * case we want to throw the packet away.
 	 */
 	
 	if (!V_rsvp_on) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (V_ip_rsvpd != NULL) { 
 		*mp = m;
 		rip_input(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 	/* Drop the packet */
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
Index: projects/vnet/sys/netinet6/ip6_input.c
===================================================================
--- projects/vnet/sys/netinet6/ip6_input.c	(revision 298547)
+++ projects/vnet/sys/netinet6/ip6_input.c	(revision 298548)
@@ -1,1729 +1,1735 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_systm.h>
 #include <net/if_llatbl.h>
 #ifdef INET
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #endif /* INET */
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netinet6/ip6_ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <netinet6/ip6protosw.h>
 
 extern struct domain inet6domain;
 
 u_char ip6_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead);
 VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
 VNET_DEFINE(u_long, in6_ifaddrhmask);
 
 static struct netisr_handler ip6_nh = {
 	.nh_name = "ip6",
 	.nh_handler = ip6_input,
 	.nh_proto = NETISR_IPV6,
 #ifdef RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef RSS
 static struct netisr_handler ip6_direct_nh = {
 	.nh_name = "ip6_direct",
 	.nh_handler = ip6_direct_input,
 	.nh_proto = NETISR_IPV6_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 VNET_DEFINE(struct pfil_head, inet6_pfil_hook);
 
 VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat);
 VNET_PCPUSTAT_SYSINIT(ip6stat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ip6stat);
 #endif /* VIMAGE */
 
 struct rmlock in6_ifaddr_lock;
 RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock");
 
 static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
 #ifdef PULLDOWN_TEST
 static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
 #endif
 
 /*
  * IP6 initialization: fill in IP6 protocol switch table.
  * All protocols not implemented in kernel go to raw IP6 protocol handler.
  */
 void
 ip6_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal",
 	    &V_ip6_auto_linklocal);
 	TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv);
 	TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr);
 
 	TAILQ_INIT(&V_in6_ifaddrhead);
 	V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR,
 	    &V_in6_ifaddrhmask);
 
 	/* Initialize packet filter hooks. */
 	V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet6_pfil_hook.ph_af = AF_INET6;
 	if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	scope6_init();
 	addrsel_policy_init();
 	nd6_init();
 	frag6_init();
 
 	V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
 
 	/* Skip global initialization stuff for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
-		return;
+		goto out;
 
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip6_init");
 
 	/* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip6_protox[i] = pr - inet6sw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip6_protox[].
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip6_protox[pr->pr_protocol] = pr - inet6sw;
 		}
 
+out:
 	netisr_register(&ip6_nh);
 #ifdef RSS
 	netisr_register(&ip6_direct_nh);
 #endif
 }
 
 /*
  * The protocol to be inserted into ip6_protox[] must be already registered
  * in inet6sw[], either statically or through pf_proto_register().
  */
 int
 ip6proto_register(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] != pr - inet6sw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/*
 	 * Find the protocol position in inet6sw[] and set the index.
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol == ip6proto) {
 			ip6_protox[pr->pr_protocol] = pr - inet6sw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ip6proto_unregister(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] == pr - inet6sw)	/* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip6_protox[ip6proto] = pr - inet6sw;
 	return (0);
 }
 
 #ifdef VIMAGE
 static void
 ip6_destroy(void *unused __unused)
 {
 	struct ifaddr *ifa, *nifa;
 	struct ifnet *ifp;
 	int error;
+
+#ifdef RSS
+	netisr_unregister(&ip6_direct_nh);
+#endif
+	netisr_unregister(&ip6_nh);
 
 	if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Cleanup addresses. */
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Cannot lock here - lock recursion. */
 		/* IF_ADDR_LOCK(ifp); */
 		TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
 
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			in6_purgeaddr(ifa);
 		}
 		/* IF_ADDR_UNLOCK(ifp); */
 		in6_ifdetach(ifp, 0);
 		mld_domifdetach(ifp);
 		/* Make sure any routes are gone as well. */
 		rt_flushifroutes_af(ifp, AF_INET6);
 	}
 	IFNET_RUNLOCK();
 
 	nd6_destroy();
 	in6_ifattach_destroy();
 
 	hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL);
 #endif
 
 static int
 ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off,
     int *nxt, int *ours)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 
 	if (ip6_hopopts_input(plen, rtalert, &m, off)) {
 #if 0	/*touches NULL pointer*/
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 #endif
 		goto out;	/* m have already been freed */
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * if the payload length field is 0 and the next header field
 	 * indicates Hop-by-Hop Options header, then a Jumbo Payload
 	 * option MUST be included.
 	 */
 	if (ip6->ip6_plen == 0 && *plen == 0) {
 		/*
 		 * Note that if a valid jumbo payload option is
 		 * contained, ip6_hopopts_input() must set a valid
 		 * (non-zero) payload length to the variable plen.
 		 */
 		IP6STAT_INC(ip6s_badoptions);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_HEADER,
 			    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
 		goto out;
 	}
 #ifndef PULLDOWN_TEST
 	/* ip6_hopopts_input() ensures that mbuf is contiguous */
 	hbh = (struct ip6_hbh *)(ip6 + 1);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		goto out;
 	}
 #endif
 	*nxt = hbh->ip6h_nxt;
 
 	/*
 	 * If we are acting as a router and the packet contains a
 	 * router alert option, see if we know the option value.
 	 * Currently, we only support the option value for MLD, in which
 	 * case we should pass the packet to the multicast routing
 	 * daemon.
 	 */
 	if (*rtalert != ~0) {
 		switch (*rtalert) {
 		case IP6OPT_RTALERT_MLD:
 			if (V_ip6_forwarding)
 				*ours = 1;
 			break;
 		default:
 			/*
 			 * RFC2711 requires unrecognized values must be
 			 * silently ignored.
 			 */
 			break;
 		}
 	}
 
 	return (0);
 
 out:
 	return (1);
 }
 
 #ifdef RSS
 /*
  * IPv6 direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip6_direct_input(struct mbuf *m)
 {
 	int off, nxt;
 	int nest;
 	struct m_tag *mtag;
 	struct ip6_direct_ctx *ip6dc;
 
 	mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL);
 	KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!"));
 
 	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
 	nxt = ip6dc->ip6dc_nxt;
 	off = ip6dc->ip6dc_off;
 
 	nest = 0;
 
 	m_tag_delete(m, mtag);
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #ifdef IPSEC
 		/*
 		 * enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ip6_ipsec_input(m, nxt))
 			goto bad;
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	m_freem(m);
 }
 #endif
 
 void
 ip6_input(struct mbuf *m)
 {
 	struct in6_addr odst;
 	struct ip6_hdr *ip6;
 	struct in6_ifaddr *ia;
 	u_int32_t plen;
 	u_int32_t rtalert = ~0;
 	int off = sizeof(struct ip6_hdr), nest;
 	int nxt, ours = 0;
 	int srcrt = 0;
 
 #ifdef IPSEC
 	/*
 	 * should the inner packet be considered authentic?
 	 * see comment in ah4_input().
 	 * NB: m cannot be NULL when passed to the input routine
 	 */
 
 	m->m_flags &= ~M_AUTHIPHDR;
 	m->m_flags &= ~M_AUTHIPDGM;
 
 #endif /* IPSEC */
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * Firewall changed destination to local.
 		 */
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		ip6 = mtod(m, struct ip6_hdr *);
 		goto hbhcheck;
 	}
 
 	/*
 	 * mbuf statistics
 	 */
 	if (m->m_flags & M_EXT) {
 		if (m->m_next)
 			IP6STAT_INC(ip6s_mext2m);
 		else
 			IP6STAT_INC(ip6s_mext1);
 	} else {
 		if (m->m_next) {
 			if (m->m_flags & M_LOOP) {
 				IP6STAT_INC(ip6s_m2m[V_loif->if_index]);
 			} else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX)
 				IP6STAT_INC(
 				    ip6s_m2m[m->m_pkthdr.rcvif->if_index]);
 			else
 				IP6STAT_INC(ip6s_m2m[0]);
 		} else
 			IP6STAT_INC(ip6s_m1);
 	}
 
 	/* drop the packet if IPv6 operation is disabled on the IF */
 	if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED))
 		goto bad;
 
 	in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive);
 	IP6STAT_INC(ip6s_total);
 
 #ifndef PULLDOWN_TEST
 	/*
 	 * L2 bridge code and some other code can return mbuf chain
 	 * that does not conform to KAME requirement.  too bad.
 	 * XXX: fails to join if interface MTU > MCLBYTES.  jumbogram?
 	 */
 	if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) {
 		struct mbuf *n;
 
 		if (m->m_pkthdr.len > MHLEN)
 			n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 			n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;	/* ENOBUFS */
 		}
 
 		m_move_pkthdr(n, m);
 		m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t));
 		n->m_len = n->m_pkthdr.len;
 		m_freem(m);
 		m = n;
 	}
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */);
 #endif
 
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		struct ifnet *inifp;
 		inifp = m->m_pkthdr.rcvif;
 		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			return;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]);
 
 	IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6);
 
 	/*
 	 * Check against address spoofing/corruption.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
 		/*
 		 * XXX: "badscope" is not very suitable for a multicast source.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
 	    !(m->m_flags & M_LOOP)) {
 		/*
 		 * In this case, the packet should come from the loopback
 		 * interface.  However, we cannot just check the if_flags,
 		 * because ip6_mloopback() passes the "actual" interface
 		 * as the outgoing/incoming interface.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 	    IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) {
 		/*
 		 * RFC4291 2.7:
 		 * Nodes must not originate a packet to a multicast address
 		 * whose scop field contains the reserved value 0; if such
 		 * a packet is received, it must be silently dropped.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) {
 		/* packet is dropped by traffic conditioner */
 		return;
 	}
 #endif
 	/*
 	 * The following check is not documented in specs.  A malicious
 	 * party may be able to use IPv4 mapped addr to confuse tcp/udp stack
 	 * and bypass security checks (act as if it was from 127.0.0.1 by using
 	 * IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * This check chokes if we are in an SIIT cloud.  As none of BSDs
 	 * support IPv4-less kernel compilation, we cannot support SIIT
 	 * environment at all.  So, it makes more sense for us to reject any
 	 * malicious packets for non-SIIT environment, than try to do a
 	 * partial support for SIIT environment.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #if 0
 	/*
 	 * Reject packets with IPv4 compatible addresses (auto tunnel).
 	 *
 	 * The code forbids auto tunnel relay case in RFC1933 (the check is
 	 * stronger than RFC1933).  We may want to re-enable it if mech-xx
 	 * is revised to forbid relaying case.
 	 */
 	if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #endif
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip6_ipsec_filtertunnel(m))
 		goto passin;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing
 	 *     (e.g. by NAT rewriting).  When this happens,
 	 *     tell ip6_forward to do the right thing.
 	 */
 	odst = ip6->ip6_dst;
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passin;
 
 	if (pfil_run_hooks(&V_inet6_pfil_hook, &m,
 	    m->m_pkthdr.rcvif, PFIL_IN, NULL))
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 	ip6 = mtod(m, struct ip6_hdr *);
 	srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		goto hbhcheck;
 	}
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 		/*
 		 * Directly ship the packet on.  This allows forwarding
 		 * packets originally destined to us to some other directly
 		 * connected host.
 		 */
 		ip6_forward(m, 1);
 		return;
 	}
 
 passin:
 	/*
 	 * Disambiguate address scope zones (if there is ambiguity).
 	 * We first make sure that the original source or destination address
 	 * is not in our internal form for scoped addresses.  Such addresses
 	 * are not necessarily invalid spec-wise, but we cannot accept them due
 	 * to the usage conflict.
 	 * in6_setscope() then also checks and rejects the cases where src or
 	 * dst are the loopback address and the receiving interface
 	 * is not loopback.
 	 */
 	if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope); /* XXX */
 		goto bad;
 	}
 	if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) ||
 	    in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) {
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	/*
 	 * Multicast check. Assume packet is for us to avoid
 	 * prematurely taking locks.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ours = 1;
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast);
 		goto hbhcheck;
 	}
 	/*
 	 * Unicast check
 	 * XXX: For now we keep link-local IPv6 addresses with embedded
 	 *      scope zone id, therefore we use zero zoneid here.
 	 */
 	ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 	if (ia != NULL) {
 		if (ia->ia6_flags & IN6_IFF_NOTREADY) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 			/* address is not ready, so discard the packet. */
 			nd6log((LOG_INFO,
 			    "ip6_input: packet to an unready address %s->%s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
 			ifa_free(&ia->ia_ifa);
 			goto bad;
 		}
 		/* Count the packet in the ip address stats */
 		counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 		counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len);
 		ifa_free(&ia->ia_ifa);
 		ours = 1;
 		goto hbhcheck;
 	}
 
 	/*
 	 * Now there is no reason to process the packet if it's not our own
 	 * and we're not a router.
 	 */
 	if (!V_ip6_forwarding) {
 		IP6STAT_INC(ip6s_cantforward);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		goto bad;
 	}
 
   hbhcheck:
 	/*
 	 * Process Hop-by-Hop options header if it's contained.
 	 * m may be modified in ip6_hopopts_input().
 	 * If a JumboPayload option is included, plen will also be modified.
 	 */
 	plen = (u_int32_t)ntohs(ip6->ip6_plen);
 	if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 		if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0)
 			return;
 	} else
 		nxt = ip6->ip6_nxt;
 
 	/*
 	 * Use mbuf flags to propagate Router Alert option to
 	 * ICMPv6 layer, as hop-by-hop options have been stripped.
 	 */
 	if (rtalert != ~0)
 		m->m_flags |= M_RTALERT_MLD;
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IPv6 header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
 		IP6STAT_INC(ip6s_tooshort);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = sizeof(struct ip6_hdr) + plen;
 			m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 		} else
 			m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Forward if desirable.
 	 */
 	if (V_ip6_mrouter &&
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		/*
 		 * If we are acting as a multicast router, all
 		 * incoming multicast packets are passed to the
 		 * kernel-level multicast forwarding function.
 		 * The packet is returned (relatively) intact; if
 		 * ip6_mforward() returns a non-zero value, the packet
 		 * must be discarded, else it may be accepted below.
 		 *
 		 * XXX TODO: Check hlim and multicast scope here to avoid
 		 * unnecessarily calling into ip6_mforward().
 		 */
 		if (ip6_mforward &&
 		    ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) {
 			IP6STAT_INC(ip6s_cantforward);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto bad;
 		}
 	} else if (!ours) {
 		ip6_forward(m, srcrt);
 		return;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Malicious party may be able to use IPv4 mapped addr to confuse
 	 * tcp/udp stack and bypass security checks (act as if it was from
 	 * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * For SIIT end node behavior, you may want to disable the check.
 	 * However, you will  become vulnerable to attacks using IPv4 mapped
 	 * source.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 
 	/*
 	 * Tell launch routine the next header
 	 */
 	IP6STAT_INC(ip6s_delivered);
 	in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver);
 	nest = 0;
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #ifdef IPSEC
 		/*
 		 * enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ip6_ipsec_input(m, nxt))
 			goto bad;
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * Hop-by-Hop options header processing. If a valid jumbo payload option is
  * included, the real payload length will be stored in plenp.
  *
  * rtalertp - XXX: should be stored more smart way
  */
 static int
 ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
     struct mbuf **mp, int *offp)
 {
 	struct mbuf *m = *mp;
 	int off = *offp, hbhlen;
 	struct ip6_hbh *hbh;
 
 	/* validation of the length of the header */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 
 	IP6_EXTHDR_CHECK(m, off, hbhlen, -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
 		sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		hbhlen);
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 #endif
 	off += hbhlen;
 	hbhlen -= sizeof(struct ip6_hbh);
 	if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
 				hbhlen, rtalertp, plenp) < 0)
 		return (-1);
 
 	*offp = off;
 	*mp = m;
 	return (0);
 }
 
 /*
  * Search header for all Hop-by-hop options and process each option.
  * This function is separate from ip6_hopopts_input() in order to
  * handle a case where the sending node itself process its hop-by-hop
  * options header. In such a case, the function is called from ip6_output().
  *
  * The function assumes that hbh header is located right after the IPv6 header
  * (RFC2460 p7), opthead is pointer into data content in m, and opthead to
  * opthead + hbhlen is located in contiguous memory region.
  */
 int
 ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
     u_int32_t *rtalertp, u_int32_t *plenp)
 {
 	struct ip6_hdr *ip6;
 	int optlen = 0;
 	u_int8_t *opt = opthead;
 	u_int16_t rtalert_val;
 	u_int32_t jumboplen;
 	const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
 
 	for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
 		switch (*opt) {
 		case IP6OPT_PAD1:
 			optlen = 1;
 			break;
 		case IP6OPT_PADN:
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = *(opt + 1) + 2;
 			break;
 		case IP6OPT_ROUTER_ALERT:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_RTALERT_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_RTALERT_LEN;
 			bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2);
 			*rtalertp = ntohs(rtalert_val);
 			break;
 		case IP6OPT_JUMBO:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_JUMBO_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_JUMBO_LEN;
 
 			/*
 			 * IPv6 packets that have non 0 payload length
 			 * must not contain a jumbo payload option.
 			 */
 			ip6 = mtod(m, struct ip6_hdr *);
 			if (ip6->ip6_plen) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt - opthead);
 				return (-1);
 			}
 
 			/*
 			 * We may see jumbolen in unaligned location, so
 			 * we'd need to perform bcopy().
 			 */
 			bcopy(opt + 2, &jumboplen, sizeof(jumboplen));
 			jumboplen = (u_int32_t)htonl(jumboplen);
 
 #if 1
 			/*
 			 * if there are multiple jumbo payload options,
 			 * *plenp will be non-zero and the packet will be
 			 * rejected.
 			 * the behavior may need some debate in ipngwg -
 			 * multiple options does not make sense, however,
 			 * there's no explicit mention in specification.
 			 */
 			if (*plenp != 0) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 #endif
 
 			/*
 			 * jumbo payload length must be larger than 65535.
 			 */
 			if (jumboplen <= IPV6_MAXPACKET) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 			*plenp = jumboplen;
 
 			break;
 		default:		/* unknown option */
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = ip6_unknown_opt(opt, m,
 			    erroff + opt - opthead);
 			if (optlen == -1)
 				return (-1);
 			optlen += 2;
 			break;
 		}
 	}
 
 	return (0);
 
   bad:
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Unknown option processing.
  * The third argument `off' is the offset from the IPv6 header to the option,
  * which is necessary if the IPv6 header the and option header and IPv6 header
  * is not contiguous in order to return an ICMPv6 error.
  */
 int
 ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6;
 
 	switch (IP6OPT_TYPE(*optp)) {
 	case IP6OPT_TYPE_SKIP: /* ignore the option */
 		return ((int)*(optp + 1));
 	case IP6OPT_TYPE_DISCARD:	/* silently discard */
 		m_freem(m);
 		return (-1);
 	case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    (m->m_flags & (M_BCAST|M_MCAST)))
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	}
 
 	m_freem(m);		/* XXX: NOTREACHED */
 	return (-1);
 }
 
 /*
  * Create the "control" list for this pcb.
  * These functions will not modify mbuf chain at all.
  *
  * With KAME mbuf chain restriction:
  * The routine will be called from upper layer handlers like tcp6_input().
  * Thus the routine assumes that the caller (tcp6_input) have already
  * called IP6_EXTHDR_CHECK() and all the extension headers are located in the
  * very first mbuf on the mbuf chain.
  *
  * ip6_savecontrol_v4 will handle those options that are possible to be
  * set on a v4-mapped socket.
  * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
  * options and handle the v6-only ones itself.
  */
 struct mbuf **
 ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
     int *v4only)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef SO_TIMESTAMP
 	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
 		struct timeval tv;
 
 		microtime(&tv);
 		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 
 #define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
 	/* RFC 2292 sec. 5 */
 	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
 		struct in6_pktinfo pi6;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			pi6.ipi6_addr.s6_addr32[0] = 0;
 			pi6.ipi6_addr.s6_addr32[1] = 0;
 			pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 			pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr;
 #else
 			/* We won't hit this code */
 			bzero(&pi6.ipi6_addr, sizeof(struct in6_addr));
 #endif
 		} else {	
 			bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
 			in6_clearscope(&pi6.ipi6_addr);	/* XXX */
 		}
 		pi6.ipi6_ifindex =
 		    (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0;
 
 		*mp = sbcreatecontrol((caddr_t) &pi6,
 		    sizeof(struct in6_pktinfo),
 		    IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
 		int hlim;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			hlim = ip->ip_ttl;
 #else
 			/* We won't hit this code */
 			hlim = 0;
 #endif
 		} else {
 			hlim = ip6->ip6_hlim & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
 		    IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
 		    IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_TCLASS) != 0) {
 		int tclass;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			tclass = ip->ip_tos;
 #else
 			/* We won't hit this code */
 			tclass = 0;
 #endif
 		} else {
 			u_int32_t flowinfo;
 
 			flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
 			flowinfo >>= 20;
 			tclass = flowinfo & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int),
 		    IPV6_TCLASS, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (v4only != NULL) {
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			*v4only = 1;
 		} else {
 			*v4only = 0;
 		}
 	}
 
 	return (mp);
 }
 
 void
 ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	int v4only = 0;
 
 	mp = ip6_savecontrol_v4(in6p, m, mp, &v4only);
 	if (v4only)
 		return;
 
 	/*
 	 * IPV6_HOPOPTS socket option.  Recall that we required super-user
 	 * privilege for the option (see ip6_ctloutput), but it might be too
 	 * strict, since there might be some hop-by-hop options which can be
 	 * returned to normal user.
 	 * See also RFC 2292 section 6 (or RFC 3542 section 8).
 	 */
 	if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) {
 		/*
 		 * Check if a hop-by-hop options header is contatined in the
 		 * received packet, and if so, store the options as ancillary
 		 * data. Note that a hop-by-hop options header must be
 		 * just after the IPv6 header, which is assured through the
 		 * IPv6 input processing.
 		 */
 		if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 			struct ip6_hbh *hbh;
 			int hbhlen = 0;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext;
 #endif
 
 #ifndef PULLDOWN_TEST
 			hbh = (struct ip6_hbh *)(ip6 + 1);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 #else
 			ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
 			    ip6->ip6_nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			hbh = mtod(ext, struct ip6_hbh *);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 			if (hbhlen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			/*
 			 * XXX: We copy the whole header even if a
 			 * jumbo payload option is included, the option which
 			 * is to be removed before returning according to
 			 * RFC2292.
 			 * Note: this constraint is removed in RFC3542
 			 */
 			*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
 			    IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
 			    IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 #endif
 		}
 	}
 
 	if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
 		int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
 
 		/*
 		 * Search for destination options headers or routing
 		 * header(s) through the header chain, and stores each
 		 * header as ancillary data.
 		 * Note that the order of the headers remains in
 		 * the chain of ancillary data.
 		 */
 		while (1) {	/* is explicit loop prevention necessary? */
 			struct ip6_ext *ip6e = NULL;
 			int elen;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext = NULL;
 #endif
 
 			/*
 			 * if it is not an extension header, don't try to
 			 * pull it from the chain.
 			 */
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_ROUTING:
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 			default:
 				goto loopend;
 			}
 
 #ifndef PULLDOWN_TEST
 			if (off + sizeof(*ip6e) > m->m_len)
 				goto loopend;
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (off + elen > m->m_len)
 				goto loopend;
 #else
 			ext = ip6_pullexthdr(m, off, nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			ip6e = mtod(ext, struct ip6_ext *);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (elen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 				if (!(in6p->inp_flags & IN6P_DSTOPTS))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p,
 					IPV6_2292DSTOPTS, IPV6_DSTOPTS),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_ROUTING:
 				if (!(in6p->inp_flags & IN6P_RTHDR))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 
 			default:
 				/*
 				 * other cases have been filtered in the above.
 				 * none will visit this case.  here we supply
 				 * the code just in case (nxt overwritten or
 				 * other cases).
 				 */
 #ifdef PULLDOWN_TEST
 				m_freem(ext);
 #endif
 				goto loopend;
 
 			}
 
 			/* proceed with the next header. */
 			off += elen;
 			nxt = ip6e->ip6e_nxt;
 			ip6e = NULL;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 			ext = NULL;
 #endif
 		}
 	  loopend:
 		;
 	}
 
 	if (in6p->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 
 }
 #undef IS2292
 
 void
 ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu)
 {
 	struct socket *so;
 	struct mbuf *m_mtu;
 	struct ip6_mtuinfo mtuctl;
 
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	/*
 	 * Notify the error by sending IPV6_PATHMTU ancillary data if
 	 * application wanted to know the MTU value.
 	 * NOTE: we notify disconnected sockets, because some udp
 	 * applications keep sending sockets disconnected.
 	 * NOTE: our implementation doesn't notify connected sockets that has
 	 * foreign address that is different than given destination addresses
 	 * (this is permitted by RFC 3542).
 	 */
 	if ((inp->inp_flags & IN6P_MTU) == 0 || (
 	    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr)))
 		return;
 
 	mtuctl.ip6m_mtu = mtu;
 	mtuctl.ip6m_addr = *dst;
 	if (sa6_recoverscope(&mtuctl.ip6m_addr))
 		return;
 
 	if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl),
 	    IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
 		return;
 
 	so =  inp->inp_socket;
 	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
 	    == 0) {
 		m_freem(m_mtu);
 		/* XXX: should count statistics */
 	} else
 		sorwakeup(so);
 }
 
 #ifdef PULLDOWN_TEST
 /*
  * pull single extension header from mbuf chain.  returns single mbuf that
  * contains the result, or NULL on error.
  */
 static struct mbuf *
 ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
 {
 	struct ip6_ext ip6e;
 	size_t elen;
 	struct mbuf *n;
 
 #ifdef DIAGNOSTIC
 	switch (nxt) {
 	case IPPROTO_DSTOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_AH: /* is it possible? */
 		break;
 	default:
 		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
 	}
 #endif
 
 	m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 	if (nxt == IPPROTO_AH)
 		elen = (ip6e.ip6e_len + 2) << 2;
 	else
 		elen = (ip6e.ip6e_len + 1) << 3;
 
 	if (elen > MLEN)
 		n = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL)
 		return NULL;
 
 	m_copydata(m, off, elen, mtod(n, caddr_t));
 	n->m_len = elen;
 	return n;
 }
 #endif
 
 /*
  * Get pointer to the previous header followed by the header
  * currently processed.
  * XXX: This function supposes that
  *	M includes all headers,
  *	the next header field and the header length field of each header
  *	are valid, and
  *	the sum of each header length equals to OFF.
  * Because of these assumptions, this function must be called very
  * carefully. Moreover, it will not be used in the near future when
  * we develop `neater' mechanism to process extension headers.
  */
 char *
 ip6_get_prevhdr(const struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 	if (off == sizeof(struct ip6_hdr))
 		return (&ip6->ip6_nxt);
 	else {
 		int len, nxt;
 		struct ip6_ext *ip6e = NULL;
 
 		nxt = ip6->ip6_nxt;
 		len = sizeof(struct ip6_hdr);
 		while (len < off) {
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len);
 
 			switch (nxt) {
 			case IPPROTO_FRAGMENT:
 				len += sizeof(struct ip6_frag);
 				break;
 			case IPPROTO_AH:
 				len += (ip6e->ip6e_len + 2) << 2;
 				break;
 			default:
 				len += (ip6e->ip6e_len + 1) << 3;
 				break;
 			}
 			nxt = ip6e->ip6e_nxt;
 		}
 		if (ip6e)
 			return (&ip6e->ip6e_nxt);
 		else
 			return NULL;
 	}
 }
 
 /*
  * get next header offset.  m will be retained.
  */
 int
 ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	struct ip6_hdr ip6;
 	struct ip6_ext ip6e;
 	struct ip6_frag fh;
 
 	/* just in case */
 	if (m == NULL)
 		panic("ip6_nexthdr: m == NULL");
 	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
 		return -1;
 
 	switch (proto) {
 	case IPPROTO_IPV6:
 		if (m->m_pkthdr.len < off + sizeof(ip6))
 			return -1;
 		m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
 		if (nxtp)
 			*nxtp = ip6.ip6_nxt;
 		off += sizeof(ip6);
 		return off;
 
 	case IPPROTO_FRAGMENT:
 		/*
 		 * terminate parsing if it is not the first fragment,
 		 * it does not make sense to parse through it.
 		 */
 		if (m->m_pkthdr.len < off + sizeof(fh))
 			return -1;
 		m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
 		/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
 		if (fh.ip6f_offlg & IP6F_OFF_MASK)
 			return -1;
 		if (nxtp)
 			*nxtp = fh.ip6f_nxt;
 		off += sizeof(struct ip6_frag);
 		return off;
 
 	case IPPROTO_AH:
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return -1;
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		if (nxtp)
 			*nxtp = ip6e.ip6e_nxt;
 		off += (ip6e.ip6e_len + 2) << 2;
 		return off;
 
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_DSTOPTS:
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return -1;
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		if (nxtp)
 			*nxtp = ip6e.ip6e_nxt;
 		off += (ip6e.ip6e_len + 1) << 3;
 		return off;
 
 	case IPPROTO_NONE:
 	case IPPROTO_ESP:
 	case IPPROTO_IPCOMP:
 		/* give up */
 		return -1;
 
 	default:
 		return -1;
 	}
 
 	/* NOTREACHED */
 }
 
 /*
  * get offset for the last header in the chain.  m will be kept untainted.
  */
 int
 ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	int newoff;
 	int nxt;
 
 	if (!nxtp) {
 		nxt = -1;
 		nxtp = &nxt;
 	}
 	while (1) {
 		newoff = ip6_nexthdr(m, off, proto, nxtp);
 		if (newoff < 0)
 			return off;
 		else if (newoff < off)
 			return -1;	/* invalid */
 		else if (newoff == off)
 			return newoff;
 
 		off = newoff;
 		proto = *nxtp;
 	}
 }
 
 /*
  * System control for IP6
  */
 
 u_char	inet6ctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		0,		0,
 	ENOPROTOOPT
 };