Index: projects/vnet/sys/net/if.c
===================================================================
--- projects/vnet/sys/net/if.c	(revision 299172)
+++ projects/vnet/sys/net/if.c	(revision 299173)
@@ -1,4076 +1,4115 @@
 /*-
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_compat.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/conf.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <netinet/if_ether.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #endif
 
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*bridge_linkstate_p)(struct ifnet *ifp);
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	if_freemulti(struct ifmultiaddr *);
 static void	if_grow(void);
 static void	if_input_default(struct ifnet *, struct mbuf *);
 static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int	ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, int, struct if_clone *);
 static int	if_detach_internal(struct ifnet *, int, struct if_clone **);
 #ifdef VIMAGE
 static void	if_vmove(struct ifnet *, struct vnet *);
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /* ipsec helper hooks */
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 
 VNET_DEFINE(int, if_index);
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 static VNET_DEFINE(int, if_indexlim) = 8;
 
 /* Table of ifnet by index. */
 VNET_DEFINE(struct ifnet **, ifindex_table);
 
 #define	V_if_indexlim		VNET(if_indexlim)
 #define	V_ifindex_table		VNET(ifindex_table)
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
  * an rwlock.  Either may be acquired shared to stablize the list, but both
  * must be acquired writable to modify the list.  This model allows us to
  * both stablize the interface list during interrupt thread processing, but
  * also to stablize it over long-running ioctls, without introducing priority
  * inversions and deadlocks.
  */
 struct rwlock ifnet_rwlock;
 RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE);
 struct sx ifnet_sxlock;
 SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);
 
 /*
  * The allocation of network interfaces is a rather non-atomic affair; we
  * need to select an index before we are ready to expose the interface for
  * use, so will use this pointer value to indicate reservation.
  */
 #define	IFNET_HOLD	(void *)(uintptr_t)(-1)
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 struct ifnet *
 ifnet_byindex_locked(u_short idx)
 {
 
 	if (idx > V_if_index)
 		return (NULL);
 	if (V_ifindex_table[idx] == IFNET_HOLD)
 		return (NULL);
 	return (V_ifindex_table[idx]);
 }
 
 struct ifnet *
 ifnet_byindex(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindex_ref(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
 		IFNET_RUNLOCK_NOSLEEP();
 		return (NULL);
 	}
 	if_ref(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Allocate an ifindex array entry; return 0 on success or an error on
  * failure.
  */
 static u_short
 ifindex_alloc(void)
 {
 	u_short idx;
 
 	IFNET_WLOCK_ASSERT();
 retry:
 	/*
 	 * Try to find an empty slot below V_if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= V_if_index; idx++) {
 		if (V_ifindex_table[idx] == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= V_if_indexlim) {
 		if_grow();
 		goto retry;
 	}
 	if (idx > V_if_index)
 		V_if_index = idx;
 	return (idx);
 }
 
 static void
 ifindex_free_locked(u_short idx)
 {
 
 	IFNET_WLOCK_ASSERT();
 
 	V_ifindex_table[idx] = NULL;
 	while (V_if_index > 0 &&
 	    V_ifindex_table[V_if_index] == NULL)
 		V_if_index--;
 }
 
 static void
 ifindex_free(u_short idx)
 {
 
 	IFNET_WLOCK();
 	ifindex_free_locked(idx);
 	IFNET_WUNLOCK();
 }
 
 static void
 ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK_ASSERT();
 
 	V_ifindex_table[idx] = ifp;
 }
 
 static void
 ifnet_setbyindex(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK();
 	ifnet_setbyindex_locked(idx, ifp);
 	IFNET_WUNLOCK();
 }
 
 struct ifaddr *
 ifaddr_byindex(u_short idx)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa = NULL;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp != NULL && (ifa = ifp->if_addr) != NULL)
 		ifa_ref(ifa);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 static void
 vnet_if_init(const void *unused __unused)
 {
 
 	TAILQ_INIT(&V_ifnet);
 	TAILQ_INIT(&V_ifg_head);
 	IFNET_WLOCK();
 	if_grow();				/* create initial table */
 	IFNET_WUNLOCK();
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 #ifdef VIMAGE
 static void
 vnet_if_uninit(const void *unused __unused)
 {
 
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
 	    "not empty", __func__, __LINE__, &V_ifnet));
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
 	    "not empty", __func__, __LINE__, &V_ifg_head));
 
 	free((caddr_t)V_ifindex_table, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
     vnet_if_uninit, NULL);
 
 static void
 vnet_if_return(const void *unused __unused)
 {
 	struct ifnet *ifp, *nifp;
 
 	/* Return all inherited interfaces to their parent vnets. */
 	TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
 		if (ifp->if_home_vnet != ifp->if_vnet)
 			if_vmove(ifp, ifp->if_home_vnet);
 	}
 }
 VNET_SYSUNINIT(vnet_if_return, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_if_return, NULL);
 #endif
 
 static void
 if_grow(void)
 {
 	int oldlim;
 	u_int n;
 	struct ifnet **e;
 
 	IFNET_WLOCK_ASSERT();
 	oldlim = V_if_indexlim;
 	IFNET_WUNLOCK();
 	n = (oldlim << 1) * sizeof(*e);
 	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
 	IFNET_WLOCK();
 	if (V_if_indexlim != oldlim) {
 		free(e, M_IFNET);
 		return;
 	}
 	if (V_ifindex_table != NULL) {
 		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
 		free((caddr_t)V_ifindex_table, M_IFNET);
 	}
 	V_if_indexlim <<= 1;
 	V_ifindex_table = e;
 }
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  */
 struct ifnet *
 if_alloc(u_char type)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
 	IFNET_WLOCK();
 	idx = ifindex_alloc();
 	ifnet_setbyindex_locked(idx, IFNET_HOLD);
 	IFNET_WUNLOCK();
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
 			free(ifp, M_IFNET);
 			ifindex_free(idx);
 			return (NULL);
 		}
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	TAILQ_INIT(&ifp->if_addrhead);
 	TAILQ_INIT(&ifp->if_multiaddrs);
 	TAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	for (int i = 0; i < IFCOUNTERS; i++)
 		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
 	ifp->if_get_counter = if_get_counter_default;
 	ifnet_setbyindex(ifp->if_index, ifp);
 	return (ifp);
 }
 
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the last reference to an
  * interface is released.
  */
 static void
 if_free_internal(struct ifnet *ifp)
 {
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("if_free_internal: interface not dying"));
 
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	if (ifp->if_description != NULL)
 		free(ifp->if_description, M_IFDESCR);
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 
 	for (int i = 0; i < IFCOUNTERS; i++)
 		counter_u64_free(ifp->if_counters[i]);
 
 	free(ifp, M_IFNET);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IFNET_WLOCK();
 	KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
 	    ("%s: freeing unallocated ifnet", ifp->if_xname));
 
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		if_free_internal(ifp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	refcount_acquire(&ifp->if_refcount);
 }
 
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (!refcount_release(&ifp->if_refcount))
 		return;
 	if_free_internal(ifp);
 }
 
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 	
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	if (ifq->ifq_maxlen == 0) 
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 	ifq->altq_tbr  = NULL;
 	ifq->altq_ifp  = ifp;
 }
 
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initalization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * Note that if_detach_internal() removes group membership unconditionally
  * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
  * Thus, when if_vmove() is applied to a cloned interface, group membership
  * is lost while a cloned one always joins a group whose name is
  * ifc->ifc_name.  To recover this after if_detach_internal() and
  * if_attach_internal(), the cloner should be specified to
  * if_attach_internal() via ifc.  If it is non-NULL, if_attach_internal()
  * attempts to join a group whose name is ifc->ifc_name.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to insure if_xname is unique or non-empty.
  */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, 0, NULL);
 }
 
 /*
  * Compute the least common TSO limit.
  */
 void
 if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	/*
 	 * 1) If there is no limit currently, take the limit from
 	 * the network adapter.
 	 *
 	 * 2) If the network adapter has a limit below the current
 	 * limit, apply it.
 	 */
 	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
 	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
 		pmax->tsomaxbytes = ifp->if_hw_tsomax;
 	}
 	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
 	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
 		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 	}
 	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
 	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
 		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 }
 
 /*
  * Update TSO limit of a network adapter.
  *
  * Returns zero if no change. Else non-zero.
  */
 int
 if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	int retval = 0;
 	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
 		ifp->if_hw_tsomax = pmax->tsomaxbytes;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
 		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
 		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
 		retval++;
 	}
 	return (retval);
 }
 
 static void
 if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
 		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
 		    ifp->if_xname);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 	/* Restore group membership for cloned interfaces. */
 	if (vmove && ifc != NULL)
 		if_clone_addgroup(ifp, ifc);
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit;
 		ifp->if_qflush = if_qflush;
 	}
 	if (ifp->if_input == NULL)
 		ifp->if_input = if_input_default;
 
 	if (ifp->if_requestencap == NULL)
 		ifp->if_requestencap = if_requestencap_default;
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possiable name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_rtrequest = link_rtrequest;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 #if defined(INET) || defined(INET6)
 		/* Use defaults for TSO, if nothing is set */
 		if (ifp->if_hw_tsomax == 0 &&
 		    ifp->if_hw_tsomaxsegcount == 0 &&
 		    ifp->if_hw_tsomaxsegsize == 0) {
 			/*
 			 * The TSO defaults needs to be such that an
 			 * NFS mbuf list of 35 mbufs totalling just
 			 * below 64K works and that a chain of mbufs
 			 * can be defragged into at most 32 segments:
 			 */
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			ifp->if_hw_tsomaxsegcount = 35;
 			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */
 
 			/* XXX some drivers set IFCAP_TSO after ethernet attach */
 			if (ifp->if_capabilities & IFCAP_TSO) {
 				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
 				    ifp->if_hw_tsomax,
 				    ifp->if_hw_tsomaxsegcount,
 				    ifp->if_hw_tsomaxsegsize);
 			}
 		}
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = TAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	IFNET_WLOCK();
 	TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 
 	/* Announce the interface. */
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
-	if (IF_AFDATA_TRYLOCK(ifp) == 0)
-		return;
+	IF_AFDATA_LOCK(ifp);
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
-	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
+
+	IF_AFDATA_LOCK(ifp);
+	ifp->if_afdata_initialized = domain_init_status;
+	IF_AFDATA_UNLOCK(ifp);
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa, *next;
 
 	/* XXX IF_ADDR_R/WLOCK */
 	TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			continue;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq ifr;
 
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
 			    NULL) == 0)
 				continue;
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeaddr(ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		IF_ADDR_WLOCK(ifp);
 		TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 		IF_ADDR_WUNLOCK(ifp);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 1);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if_detach_internal(ifp, 0, NULL);
 	CURVNET_RESTORE();
 }
 
 /*
  * The vmove, if set, flag indicates that we are called from a callpath
  * that is moving an interface to a different vnet instance.
  *
  * The shutdown flag, if set, indicates that we are called in the
  * progress of shutting down a vnet instance.  Currently only the
  * vnet_if_return SYSUNINIT function sets it.  Note: we can be called
  * on a vnet instance shutdown without this flag being set, e.g., when
  * the cloned interfaces are destoyed as first thing of teardown.
  */
 static int
 if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp)
 {
 	struct ifaddr *ifa;
 	int i;
 	struct domain *dp;
  	struct ifnet *iter;
  	int found = 0, shutdown;
 
 	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
 		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
 	IFNET_WLOCK();
 	TAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			TAILQ_REMOVE(&V_ifnet, ifp, if_link);
 			found = 1;
 			break;
 		}
-#ifdef VIMAGE
-	if (found)
-		curvnet->vnet_ifcnt--;
-#endif
 	IFNET_WUNLOCK();
 	if (!found) {
 		/*
 		 * While we would want to panic here, we cannot
 		 * guarantee that the interface is indeed still on
 		 * the list given we don't hold locks all the way.
 		 */
 		return (ENOENT);
 #if 0
 		if (vmove)
 			panic("%s: ifp=%p not on the ifnet tailq %p",
 			    __func__, ifp, &V_ifnet);
 		else
 			return; /* XXX this should panic as well? */
 #endif
 	}
 
 	/*
 	 * At this point we know the interface still was on the ifnet list
 	 * and we removed it so we are in a stable state.
 	 */
+#ifdef VIMAGE
+	curvnet->vnet_ifcnt--;
+#endif
 
 	/*
 	 * In any case (destroy or vmove) detach us from the groups
 	 * and remove/wait for pending events on the taskq.
 	 * XXX-BZ in theory an interface could still enqueue a taskq change?
 	 */
 	if_delgroups(ifp);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 
 	/*
 	 * Check if this is a cloned interface or not. Must do even if
 	 * shutting down as a if_vmove_reclaim() would move the ifp and
 	 * the if_clone_addgroup() will have a corrupted string overwise
 	 * from a gibberish pointer.
 	 */
 	if (vmove && ifcp != NULL)
 		*ifcp = if_clone_findifc(ifp);
 
+	if_down(ifp);
+
 	/*
 	 * On VNET shutdown abort here as the stack teardown will do all
 	 * the work top-down for us.
 	 */
 	if (shutdown) {
 		/*
 		 * In case of a vmove we are done here without error.
 		 * If we would signal an error it would lead to the same
 		 * abort as if we did not find the ifnet anymore.
 		 * if_detach() calls us in void context and does not care
 		 * about an early abort notification, so life is splendid :)
 		 */
 		goto finish_vnet_shutdown;
 	}
 
 	/*
 	 * At this point we are not tearing down a VNET and are either
 	 * going to destroy or vmove the interface and have to cleanup
 	 * accordingly.
 	 */
 
 	/*
 	 * Remove routes and flush queues.
 	 */
-	if_down(ifp);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp, 1);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp, 1);
 #endif
 	if_purgemaddrs(ifp);
 
 	/* Announce that the interface is gone. */
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Remove link ifaddr pointer and maybe decrement if_index.
 		 * Clean up all addresses.
 		 */
 		ifp->if_addr = NULL;
 
 		/* We can now free link ifaddr. */
 		if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = TAILQ_FIRST(&ifp->if_addrhead);
 			IF_ADDR_WLOCK(ifp);
 			TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 			IF_ADDR_WUNLOCK(ifp);
 			ifa_free(ifa);
 		}
 	}
 
 	rt_flushifroutes(ifp);
 
 finish_vnet_shutdown:
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
-		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
+		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
+			ifp->if_afdata[dp->dom_family] = NULL;
+		}
 	}
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  * An attempt is made to shrink if_index in current vnet, find an
  * unused if_index in target vnet and calls if_grow() if necessary,
  * and finally find an unused if_xname for the target vnet.
  */
 static void
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 	struct if_clone *ifc;
 	int rc;
 	u_int bif_dlt, bif_hdrlen;
 
 	/*
 	 * if_detach_internal() will call the eventhandler to notify
 	 * interface departure.  That will detach if_bpf.  We need to
 	 * safe the dlt and hdrlen so we can re-attach it later.
 	 */
 	bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
 
 	/*
 	 * Detach from current vnet, but preserve LLADDR info, do not
 	 * mark as dead etc. so that the ifnet can be reattached later.
 	 * If we cannot find it, we lost the race to someone else.
 	 */
 	rc = if_detach_internal(ifp, 1, &ifc);
 	if (rc != 0)
 		return;
 
 	/*
 	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
 	 * the if_index for that vnet if possible.
 	 *
 	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
 	 * or we'd lock on one vnet and unlock on another.
 	 */
 	IFNET_WLOCK();
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	/*
 	 * Perform interface-specific reassignment tasks, if provided by
 	 * the driver.
 	 */
 	if (ifp->if_reassign != NULL)
 		ifp->if_reassign(ifp, new_vnet, NULL);
 
 	/*
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
 
 	IFNET_WLOCK();
 	ifp->if_index = ifindex_alloc();
 	ifnet_setbyindex_locked(ifp->if_index, ifp);
 	IFNET_WUNLOCK();
 
 	if_attach_internal(ifp, 1, ifc);
 
 	if (ifp->if_bpf == NULL)
 		bpfattach(ifp, bif_dlt, bif_hdrlen);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
+	int shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exists in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
-	CURVNET_RESTORE();
 	if (difp != NULL) {
+		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
+	/* Make sure the VNET is stable. */
+	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
+		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
+	if (shutdown) {
+		CURVNET_RESTORE();
+		prison_free(pr);
+		return (EBUSY);
+	}
+	CURVNET_RESTORE();
+
 	/* Move the interface into the child jail/vnet. */
 	if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
+ 	int shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
+	/* Make sure the VNET is stable. */
+	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
+		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
+	if (shutdown) {
+		CURVNET_RESTORE();
+		prison_free(pr);
+		return (EBUSY);
+	}
+
 	/* Get interface back from child jail/vnet. */
 	if_vmove(ifp, vnet_dst);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
 	    M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
 	    M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
 		    M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		TAILQ_INIT(&ifg->ifg_members);
 		TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 		if (ifgm->ifgm_ifp == ifp)
 			break;
 
 	if (ifgm != NULL) {
 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
 		free(ifgm, M_TEMP);
 	}
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 		IFNET_WUNLOCK();
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	} else
 		IFNET_WUNLOCK();
 
 	free(ifgl, M_TEMP);
 
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	char groupname[IFNAMSIZ];
 
 	IFNET_WLOCK();
 	while (!TAILQ_EMPTY(&ifp->if_groups)) {
 		ifgl = TAILQ_FIRST(&ifp->if_groups);
 
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 
 		IF_ADDR_WLOCK(ifp);
 		TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);		// <<<<
 		IF_ADDR_WUNLOCK(ifp);
 
 		TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 			if (ifgm->ifgm_ifp == ifp)
 				break;
 
 		if (ifgm != NULL) {
 			TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
 			    ifgm_next);
 			free(ifgm, M_TEMP);
 		}
 
 		if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 			TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 			IFNET_WUNLOCK();
 			EVENTHANDLER_INVOKE(group_detach_event,
 			    ifgl->ifgl_group);
 			free(ifgl->ifgl_group, M_TEMP);
 		} else
 			IFNET_WUNLOCK();
 
 		free(ifgl, M_TEMP);
 
 		EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 		IFNET_WLOCK();
 	}
 	IFNET_WUNLOCK();
 }
 
 /*
  * Stores all groups from an interface in memory pointed
  * to by data
  */
 static int
 if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
 {
 	int			 len, error;
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 	struct ifgroupreq	*ifgr = data;
 
 	if (ifgr->ifgr_len == 0) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		IF_ADDR_RUNLOCK(ifp);
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	/* XXX: wire */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq)) {
 			IF_ADDR_RUNLOCK(ifp);
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 		    	IF_ADDR_RUNLOCK(ifp);
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by data
  */
 static int
 if_getgroupmembers(struct ifgroupreq *data)
 {
 	struct ifgroupreq	*ifgr = data;
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
 			break;
 	if (ifg == NULL) {
 		IFNET_RUNLOCK();
 		return (ENOENT);
 	}
 
 	if (ifgr->ifgr_len == 0) {
 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
 			ifgr->ifgr_len += sizeof(ifgrq);
 		IFNET_RUNLOCK();
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			IFNET_RUNLOCK();
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 			IFNET_RUNLOCK();
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Return counter values from counter(9)s stored in ifnet.
  */
 uint64_t
 if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	return (counter_u64_fetch(ifp->if_counters[cnt]));
 }
 
 /*
  * Increase an ifnet counter. Usually used for counters shared
  * between the stack and a driver, but function supports them all.
  */
 void
 if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	counter_u64_add(ifp->if_counters[cnt], inc);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Wrapper functions for struct ifnet address list locking macros.  These are
  * used by kernel modules to avoid encoding programming interface or binary
  * interface assumptions that may be violated when kernel-internal locking
  * approaches change.
  */
 void
 if_addr_rlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RLOCK(ifp);
 }
 
 void
 if_addr_runlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RUNLOCK(ifp);
 }
 
 void
 if_maddr_rlock(if_t ifp)
 {
 
 	IF_ADDR_RLOCK((struct ifnet *)ifp);
 }
 
 void
 if_maddr_runlock(if_t ifp)
 {
 
 	IF_ADDR_RUNLOCK((struct ifnet *)ifp);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (ifa == NULL)
 		return (NULL);
 
 	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 
 	refcount_init(&ifa->ifa_refcnt, 1);
 
 	return (ifa);
 
 fail:
 	/* free(NULL) is okay */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 
 	return (NULL);
 }
 
 void
 ifa_ref(struct ifaddr *ifa)
 {
 
 	refcount_acquire(&ifa->ifa_refcnt);
 }
 
 void
 ifa_free(struct ifaddr *ifa)
 {
 
 	if (refcount_release(&ifa->ifa_refcnt)) {
 		counter_u64_free(ifa->ifa_opackets);
 		counter_u64_free(ifa->ifa_ipackets);
 		counter_u64_free(ifa->ifa_obytes);
 		counter_u64_free(ifa->ifa_ibytes);
 		free(ifa, M_IFADDR);
 	}
 }
 
 static int
 ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa,
     struct sockaddr *ia)
 {
 	int error;
 	struct rt_addrinfo info;
 	struct sockaddr_dl null_sdl;
 	struct ifnet *ifp;
 
 	ifp = ifa->ifa_ifp;
 
 	bzero(&info, sizeof(info));
 	if (cmd != RTM_DELETE)
 		info.rti_ifp = V_loif;
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type);
 
 	error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib);
 
 	if (error != 0)
 		log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n",
 		    __func__, otype, if_name(ifp), error);
 
 	return (error);
 }
 
 int
 ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia));
 }
 
 int
 ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia));
 }
 
 int
 ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia));
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  */
 
 #define	sa_dl_equal(a1, a2)	\
 	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
 	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
 	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
 	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
 	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  */
 /*ARGSUSED*/
 static struct ifaddr *
 ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (sa_equal(addr, ifa->ifa_addr)) {
 				if (getref)
 					ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 			/* IP6 doesn't have broadcast */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				if (getref)
 					ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 struct ifaddr *
 ifa_ifwithaddr(const struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 1));
 }
 
 int
 ifa_ifwithaddr_check(const struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 0) != NULL);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  */
 /* ARGSUSED */
 struct ifaddr *
 ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
 struct ifaddr *
 ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	const char *addr_data = addr->sa_data, *cplim;
 
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 	    const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr;
 	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
 		return (ifaddr_byindex(sdl->sdl_index));
 	}
 
 	/*
 	 * Scan though each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.  Maintain a reference
 	 * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that
 	 * kept it stable when we move onto the next interface.
 	 */
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			const char *cp, *cp2, *cp3;
 
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					ifa_ref(ifa);
 					IF_ADDR_RUNLOCK(ifp);
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit dissagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					if (ifa_maybe != NULL)
 						ifa_free(ifa_maybe);
 					ifa_maybe = ifa;
 					ifa_ref(ifa_maybe);
 				}
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	if (ifa_maybe != NULL)
 		ifa_free(ifa_maybe);
 	return (ifa);
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  */
 struct ifaddr *
 ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	const char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			if (sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 
 	return (cur->ifa_carp && (!next->ifa_carp ||
 	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
 }
 
 #include <net/if_llatbl.h>
 
 /*
  * Default action when installing a route with a Link Level gateway.
  * Lookup an appropriate real ifa to point to.
  * This should be moved to /sys/net/link.c eventually.
  */
 static void
 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct ifaddr *ifa, *oifa;
 	struct sockaddr *dst;
 	struct ifnet *ifp;
 
 	if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
 	    ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
 		return;
 	ifa = ifaof_ifpforaddr(dst, ifp);
 	if (ifa) {
 		oifa = rt->rt_ifa;
 		rt->rt_ifa = ifa;
 		ifa_free(oifa);
 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
 			ifa->ifa_rtrequest(cmd, rt, info);
 	}
 }
 
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 
 	return (malloc(size, M_TEMP, flags));
 }
 
 void
 link_free_sdl(struct sockaddr *sa)
 {
 	free(sa, M_TEMP);
 }
 
 /*
  * Fills in given sdl with interface basic info.
  * Returns pointer to filled sdl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)paddr;
 	memset(sdl, 0, sizeof(struct sockaddr_dl));
 	sdl->sdl_len = sizeof(struct sockaddr_dl);
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = iftype;
 
 	return (sdl);
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags &= ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
 	ifp->if_qflush(ifp);
 
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags |= flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFUP, ifa->ifa_addr);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Handle a change in the interface link state. To avoid LORs
  * between driver lock and upper layer locks, as well as possible
  * recursions, we post event to taskqueue, and all job
  * is done in static do_link_state_change().
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 	/* Return if state hasn't changed. */
 	if (ifp->if_link_state == link_state)
 		return;
 
 	ifp->if_link_state = link_state;
 
 	taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 }
 
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp = (struct ifnet *)arg;
 	int link_state = ifp->if_link_state;
 	CURVNET_SET(ifp->if_vnet);
 
 	/* Notify that the link state has changed. */
 	rt_ifmsg(ifp);
 	if (ifp->if_vlantrunk != NULL)
 		(*vlan_link_state_p)(ifp);
 
 	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
 	    ifp->if_l2com != NULL)
 		(*ng_ether_link_state_p)(ifp, link_state);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	if (ifp->if_bridge)
 		(*bridge_linkstate_p)(ifp);
 	if (ifp->if_lagg)
 		(*lagg_linkstate_p)(ifp, link_state);
 
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
 		    NULL);
 	if (pending > 1)
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	if (log_link_state_change)
 		log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 void
 if_down(struct ifnet *ifp)
 {
 
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 void
 if_up(struct ifnet *ifp)
 {
 
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Flush an interface queue.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
 	struct ifaltq *ifq;
 	
 	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
 		ALTQ_PURGE(ifq);
 #endif
 	n = ifq->ifq_head;
 	while ((m = n) != 0) {
 		n = m->m_nextpkt;
 		m_freem(m);
 	}
 	ifq->ifq_head = 0;
 	ifq->ifq_tail = 0;
 	ifq->ifq_len = 0;
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Map interface name to interface structure pointer, with or without
  * returning a reference.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
 		    !(ifp->if_flags & IFF_DYING))
 			break;
 	}
 	if (ifp != NULL)
 		if_ref(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 struct ifnet *
 ifunit(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
 			break;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Hardware specific interface ioctls.
  */
 static int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever worked? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			if (ifr->ifr_buffer.length < descrlen)
 				ifr->ifr_buffer.buffer = NULL;
 			else
 				error = copyout(ifp->if_description,
 				    ifr->ifr_buffer.buffer, descrlen);
 			ifr->ifr_buffer.length = descrlen;
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr->ifr_buffer.length > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr->ifr_buffer.length == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR,
 			    M_WAITOK | M_ZERO);
 			error = copyin(ifr->ifr_buffer.buffer, descrbuf,
 			    ifr->ifr_buffer.length - 1);
 			if (error) {
 				free(descrbuf, M_IFDESCR);
 				break;
 			}
 		}
 
 		sx_xlock(&ifdescr_sx);
 		odescrbuf = ifp->if_description;
 		ifp->if_description = descrbuf;
 		sx_xunlock(&ifdescr_sx);
 
 		getmicrotime(&ifp->if_lastchange);
 		free(odescrbuf, M_IFDESCR);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			if_up(ifp);
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
 			    ifp->if_xname,
 			    (new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
 		}
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 		
 		/* Announce the departure of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		log(LOG_INFO, "%s: changing name to '%s'\n",
 		    ifp->if_xname, new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 		/* Announce the return of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp);
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			IF_ADDR_RLOCK(ifp);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			IF_ADDR_RUNLOCK(ifp);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 	case SIOCGIFGENERIC:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		break;
 
 	case SIOCAIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	case SIOCGIFGROUP:
 		if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
 			return (error);
 		break;
 
 	case SIOCDIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	default:
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 #endif
 
 /*
  * Interface ioctls.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
+#ifdef VIMAGE
+	int shutdown;
+#endif
 
 	CURVNET_SET(so->so_vnet);
+#ifdef VIMAGE
+	/* Make sure the VNET is stable. */
+	shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET &&
+		 so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
+	if (shutdown) {
+		CURVNET_RESTORE();
+		return (EBUSY);
+	}
+#endif
+
+
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		CURVNET_RESTORE();
 		return (error);
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGIFCONF32:
 		{
 			struct ifconf32 *ifc32;
 			struct ifconf ifc;
 
 			ifc32 = (struct ifconf32 *)data;
 			ifc.ifc_len = ifc32->ifc_len;
 			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 
 			error = ifconf(SIOCGIFCONF, (void *)&ifc);
 			CURVNET_RESTORE();
 			if (error == 0)
 				ifc32->ifc_len = ifc.ifc_len;
 			return (error);
 		}
 #endif
 	}
 	ifr = (struct ifreq *)data;
 
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name),
 			    cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 		if (error == 0)
 			error = if_clone_destroy(ifr->ifr_name);
 		CURVNET_RESTORE();
 		return (error);
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCGIFGMEMB:
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		CURVNET_RESTORE();
 		return (error);
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	}
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		return (ENXIO);
 	}
 
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (error);
 	}
 
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
 	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
 	    ifp, td));
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 	if_rele(ifp);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * The code common to handling reference counted flags,
  * e.g., in ifpromisc() and if_allmulti().
  * The "pflag" argument can specify a permanent mode flag to check,
  * such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int error;
 	int oldflags, oldcount;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch)
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	else
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 
 	/* In case this mode is permanent, just touch refcount */
 	if (ifp->if_flags & pflag) {
 		*refcount += onswitch ? 1 : -1;
 		return (0);
 	}
 
 	/* Save ifnet parameters for if_ioctl() may fail */
 	oldcount = *refcount;
 	oldflags = ifp->if_flags;
 	
 	/*
 	 * See if we aren't the only and touching refcount is enough.
 	 * Actually toggle interface flag if we are the first or last.
 	 */
 	if (onswitch) {
 		if ((*refcount)++)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		if (--(*refcount))
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* Call down the driver since we've changed interface flags */
 	if (ifp->if_ioctl == NULL) {
 		error = EOPNOTSUPP;
 		goto recover;
 	}
 	ifr.ifr_flags = ifp->if_flags & 0xffff;
 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
 	error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	if (error)
 		goto recover;
 	/* Notify userland that interface flags have changed */
 	rt_ifmsg(ifp);
 	return (0);
 
 recover:
 	/* Recover after driver error */
 	*refcount = oldcount;
 	ifp->if_flags = oldflags;
 	return (error);
 }
 
 /*
  * Set/clear promiscuous mode on interface ifp based on the truth value
  * of pswitch.  The calls are reference counted so that only the first
  * "on" request actually has an effect, as does the final "off" request.
  * Results are undefined if the "off" and "on" requests are not matched.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int error;
 	int oldflags = ifp->if_flags;
 
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 			   &ifp->if_pcount, pswitch);
 	/* If promiscuous mode status has changed, log a message */
 	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
 		log(LOG_INFO, "%s: promiscuous mode %s\n",
 		    ifp->if_xname,
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	return (error);
 }
 
 /*
  * Return interface configuration
  * of system.  List may be used
  * in later ioctl's (above) to get
  * other information.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
 	max_len = MAXPHYS - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		int addrs;
 
 		/*
 		 * Zero the ifr_name buffer to make sure we don't
 		 * disclose the contents of the stack.
 		 */
 		memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				ifr.ifr_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (addrs == 0) {
 			bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 
 	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
 }
 
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (sa->sa_family == AF_LINK) {
 			if (sa_dl_equal(ifma->ifma_addr, sa))
 				break;
 		} else {
 			if (sa_equal(ifma->ifma_addr, sa))
 				break;
 		}
 	}
 
 	return ifma;
 }
 
 /*
  * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
  * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
  * the ifnet multicast address list here, so the caller must do that and
  * other setup work (such as notifying the device driver).  The reference
  * count is initialized to 1.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *dupsa;
 
 	ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
 	    M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, dupsa, sa->sa_len);
 	ifma->ifma_addr = dupsa;
 
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 
 	if (llsa == NULL) {
 		ifma->ifma_lladdr = NULL;
 		return (ifma);
 	}
 
 	dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma->ifma_addr, M_IFMADDR);
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(llsa, dupsa, llsa->sa_len);
 	ifma->ifma_lladdr = dupsa;
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: free ifmultiaddr structure and possibly attached related
  * addresses.  The caller is responsible for implementing reference
  * counting, notifying the driver, handling routing messages, and releasing
  * any dependent link layer state.
  */
 static void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
 	    ifma->ifma_refcount));
 
 	if (ifma->ifma_lladdr != NULL)
 		free(ifma->ifma_lladdr, M_IFMADDR);
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	struct sockaddr_dl sdl;
 	int error;
 
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions returns address data which
 	 * fits inside default sockaddr_dl structure. However callback
 	 * can allocate another sockaddr structure, in that case we need to
 	 * free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate is as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	}
 
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Delete a multicast group membership by network-layer group address.
  *
  * Returns ENOENT if the entry could not be found. If ifp no longer
  * exists, results are undefined. This entry point should only be used
  * from subsystems which do appropriate locking to hold ifp for the
  * duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int lastref;
 #ifdef INVARIANTS
 	struct ifnet *oifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(oifp, &V_ifnet, if_link)
 		if (ifp == oifp)
 			break;
 	if (ifp != oifp)
 		ifp = NULL;
 	IFNET_RUNLOCK_NOSLEEP();
 
 	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
 #endif
 	if (ifp == NULL)
 		return (ENOENT);
 
 	IF_ADDR_WLOCK(ifp);
 	lastref = 0;
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL)
 		lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	if (ifma == NULL)
 		return (ENOENT);
 
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Delete a multicast group membership by group membership pointer.
  * Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared.
  */
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 	struct ifnet *ifp;
 	int lastref;
 
 	ifp = ifma->ifma_ifp;
 #ifdef DIAGNOSTIC
 	if (ifp == NULL) {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	} else {
 		struct ifnet *oifp;
 
 		IFNET_RLOCK_NOSLEEP();
 		TAILQ_FOREACH(oifp, &V_ifnet, if_link)
 			if (ifp == oifp)
 				break;
 		if (ifp != oifp) {
 			printf("%s: ifnet %p disappeared\n", __func__, ifp);
 			ifp = NULL;
 		}
 		IFNET_RUNLOCK_NOSLEEP();
 	}
 #endif
 	/*
 	 * If and only if the ifnet instance exists: Acquire the address lock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_WLOCK(ifp);
 
 	lastref = if_delmulti_locked(ifp, ifma, 0);
 
 	if (ifp != NULL) {
 		/*
 		 * If and only if the ifnet instance exists:
 		 *  Release the address lock.
 		 *  If the group was left: update the hardware hash filter.
 		 */
 		IF_ADDR_WUNLOCK(ifp);
 		if (lastref && ifp->if_ioctl != NULL) {
 			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 		}
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	ifp = ifma->ifma_ifp;
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ifp != NULL) {
 				TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
 				    ifma_link);
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 
 	if (ifp != NULL)
 		TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if_freemulti(ifma);
 
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * At this time we only support certain types of interfaces,
  * and we don't allow the length of the address to change.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 
 	IF_ADDR_RLOCK(ifp);
 	ifa = ifp->if_addr;
 	if (ifa == NULL) {
 		IF_ADDR_RUNLOCK(ifp);
 		return (EINVAL);
 	}
 	ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL) {
 		ifa_free(ifa);
 		return (EINVAL);
 	}
 	if (len != sdl->sdl_alen) {	/* don't allow length to change */
 		ifa_free(ifa);
 		return (EINVAL);
 	}
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_XETHER:
 	case IFT_ISO88025:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_ARCNET:
 	case IFT_IEEE8023ADLAG:
 	case IFT_IEEE80211:
 		bcopy(lladdr, LLADDR(sdl), len);
 		ifa_free(ifa);
 		break;
 	default:
 		ifa_free(ifa);
 		return (ENODEV);
 	}
 
 	/*
 	 * If the interface is already up, we need
 	 * to re-init it in order to reprogram its
 	 * address filter.
 	 */
 	if ((ifp->if_flags & IFF_UP) != 0) {
 		if (ifp->if_ioctl) {
 			ifp->if_flags &= ~IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 			ifp->if_flags |= IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 		}
 	}
 	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 	return (0);
 }
 
 /*
  * Compat function for handling basic encapsulation requests.
  * Not converted stacks (FDDI, IB, ..) supports traditional
  * output model: ARP (and other similar L2 protocols) are handled
  * inside output routine, arpresolve/nd6_resolve() returns MAC
  * address instead of full prepend.
  *
  * This function creates calculated header==MAC for IPv4/IPv6 and
  * returns EAFNOSUPPORT (which is then handled in ARP code) for other
  * address families.
  */
 static int
 if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
 {
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < req->lladdr_len)
 		return (ENOMEM);
 
 	switch (req->family) {
 	case AF_INET:
 	case AF_INET6:
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	/* Copy lladdr to storage as is */
 	memmove(req->buf, req->lladdr, req->lladdr_len);
 	req->bufsize = req->lladdr_len;
 	req->lladdr_off = 0;
 
 	return (0);
 }
 
 /*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 	if (unit != IF_DUNIT_NONE)
 		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 	else
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 }
 
 int
 if_printf(struct ifnet *ifp, const char * fmt, ...)
 {
 	va_list ap;
 	int retval;
 
 	retval = printf("%s: ", ifp->if_xname);
 	va_start(ap, fmt);
 	retval += vprintf(fmt, ap);
 	va_end(ap);
 	return (retval);
 }
 
 void
 if_start(struct ifnet *ifp)
 {
 
 	(*(ifp)->if_start)(ifp);
 }
 
 /*
  * Backwards compatibility interface for drivers 
  * that have not implemented it
  */
 static int
 if_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	int error;
 
 	IFQ_HANDOFF(ifp, m, error);
 	return (error);
 }
 
 static void
 if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
 {
 
 	m_freem(m);
 }
 
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int active = 0;
 
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		IF_UNLOCK(ifq);
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
 		if (m->m_flags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	if (ifp != NULL && !active)
 		(*(ifp)->if_start)(ifp);
 	return (1);
 }
 
 void
 if_register_com_alloc(u_char type,
     if_com_alloc_t *a, if_com_free_t *f)
 {
 	
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_alloc[type] = a;
 	if_com_free[type] = f;
 }
 
 void
 if_deregister_com_alloc(u_char type)
 {
 	
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 	if_com_alloc[type] = NULL;
 	if_com_free[type] = NULL;
 }
 
 /* API for driver access to network stack owned ifnet.*/
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	uint64_t oldbrate;
 
 	oldbrate = ifp->if_baudrate;
 	ifp->if_baudrate = baudrate;
 	return (oldbrate);
 }
 
 uint64_t
 if_getbaudrate(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_baudrate);
 }
 
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capabilities = capabilities;
 	return (0);
 }
 
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	((struct ifnet *)ifp)->if_capabilities |= setbit;
 	((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
 
 	return (0);
 }
 
 int
 if_getcapabilities(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capabilities;
 }
 
 int 
 if_setcapenable(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capenable = capabilities;
 	return (0);
 }
 
 int 
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	if(setcap) 
 		((struct ifnet *)ifp)->if_capenable |= setcap;
 	if(clearcap)
 		((struct ifnet *)ifp)->if_capenable &= ~clearcap;
 
 	return (0);
 }
 
 const char *
 if_getdname(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_dname;
 }
 
 int 
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	((struct ifnet *)ifp)->if_capenable ^= togglecap;
 	return (0);
 }
 
 int
 if_getcapenable(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capenable;
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexiblity for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 	return (0);
 }
 
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
 	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
 
 	return (0);
 }
 
 int
 if_getdrvflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_drv_flags;
 }
  
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags = flags;
 	return (0);
 }
 
 
 int
 if_setflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_flags = flags;
 	return (0);
 }
 
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	((struct ifnet *)ifp)->if_flags |= set;
 	((struct ifnet *)ifp)->if_flags &= ~clear;
 
 	return (0);
 }
 
 int
 if_getflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_flags;
 }
 
 int
 if_clearhwassist(if_t ifp)
 {
 	((struct ifnet *)ifp)->if_hwassist = 0;
 	return (0);
 }
 
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	((struct ifnet *)ifp)->if_hwassist |= toset;
 	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
 
 	return (0);
 }
 
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 int
 if_gethwassist(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_hwassist;
 }
 
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	((struct ifnet *)ifp)->if_mtu = mtu;
 	return (0);
 }
 
 int
 if_getmtu(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_mtu;
 }
 
 int
 if_getmtu_family(if_t ifp, int family)
 {
 	struct domain *dp;
 
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
 			return (dp->dom_ifmtu((struct ifnet *)ifp));
 	}
 
 	return (((struct ifnet *)ifp)->if_mtu);
 }
 
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	((struct ifnet *)ifp)->if_softc = softc;
 	return (0);
 }
 
 void *
 if_getsoftc(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_softc;
 }
 
 void 
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
 	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
 }
 
 void 
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 	m->m_pkthdr.ether_vtag = tag;	
 }
 
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 
 	return (m->m_pkthdr.ether_vtag);
 }
 
 int
 if_sendq_empty(if_t ifp)
 {
 	return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
 }
 
 struct ifaddr *
 if_getifaddr(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_addr;
 }
 
 int
 if_getamcount(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_amcount;
 }
 
 
 int
 if_setsendqready(if_t ifp)
 {
 	IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
 	return (0);
 }
 
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
 	((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
 
 	return (0);
 }
 
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
 }
 
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
 	return (0);
 
 }
 
 /* XXX */
 #ifndef ETH_ADDR_LEN
 #define ETH_ADDR_LEN 6
 #endif
 
 int 
 if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max)
 {
 	struct ifmultiaddr *ifma;
 	uint8_t *lmta = (uint8_t *)mta;
 	int mcnt = 0;
 
 	TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 
 		if (mcnt == max)
 			break;
 
 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
 		    &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
 		mcnt++;
 	}
 	*cnt = mcnt;
 
 	return (0);
 }
 
 int
 if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max)
 {
 	int error;
 
 	if_maddr_rlock(ifp);
 	error = if_setupmultiaddr(ifp, mta, cnt, max);
 	if_maddr_runlock(ifp);
 	return (error);
 }
 
 int
 if_multiaddr_count(if_t ifp, int max)
 {
 	struct ifmultiaddr *ifma;
 	int count;
 
 	count = 0;
 	if_maddr_rlock(ifp);
 	TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		count++;
 		if (count == max)
 			break;
 	}
 	if_maddr_runlock(ifp);
 	return (count);
 }
 
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct mbuf *m;
 	IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
 
 	return (m);
 }
 
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
 	return (0);
 }
 
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	((struct ifnet *)ifp)->if_hdrlen = len;
 	return (0);
 }
 
 caddr_t
 if_getlladdr(if_t ifp)
 {
 	return (IF_LLADDR((struct ifnet *)ifp));
 }
 
 void *
 if_gethandle(u_char type)
 {
 	return (if_alloc(type));
 }
 
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	BPF_MTAP(ifp, m);
 }
 
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	ETHER_BPF_MTAP(ifp, m);
 }
 
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifp);
 }
 
 void
 if_setinitfn(if_t ifp, void (*init_fn)(void *))
 {
 	((struct ifnet *)ifp)->if_init = init_fn;
 }
 
 void
 if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
 {
 	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
 }
 
 void
 if_setstartfn(if_t ifp, void (*start_fn)(if_t))
 {
 	((struct ifnet *)ifp)->if_start = (void *)start_fn;
 }
 
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	((struct ifnet *)ifp)->if_transmit = start_fn;
 }
 
 void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	((struct ifnet *)ifp)->if_qflush = flush_fn;
 	
 }
 
 void
 if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
 {
 
 	ifp->if_get_counter = fn;
 }
 
 /* Revisit these - These are inline functions originally. */
 int
 drbr_inuse_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_inuse(ifh, br);
 }
 
 struct mbuf*
 drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_dequeue(ifh, br);
 }
 
 int
 drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_needs_enqueue(ifh, br);
 }
 
 int
 drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
 {
 	return drbr_enqueue(ifh, br, m);
 
 }
Index: projects/vnet/sys/net/if_epair.c
===================================================================
--- projects/vnet/sys/net/if_epair.c	(revision 299172)
+++ projects/vnet/sys/net/if_epair.c	(revision 299173)
@@ -1,1010 +1,1012 @@
 /*-
  * Copyright (c) 2008 The FreeBSD Foundation
  * Copyright (c) 2009-2010 Bjoern A. Zeeb <bz@FreeBSD.org>
  * All rights reserved.
  *
  * This software was developed by CK Software GmbH under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * A pair of virtual back-to-back connected ethernet like interfaces
  * (``two interfaces with a virtual cross-over cable'').
  *
  * This is mostly intended to be used to provide connectivity between
  * different virtual network stack instances.
  */
 /*
  * Things to re-think once we have more experience:
  * - ifp->if_reassign function once we can test with vimage. Depending on
  *   how if_vmove() is going to be improved.
  * - Real random etheraddrs that are checked to be uniquish; we would need
  *   to re-do them in case we move the interface between network stacks
  *   in a private if_reassign function.
  *   In case we bridge to a real interface/network or between indepedent
  *   epairs on multiple stacks/machines, we may need this.
  *   For now let the user handle that case.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/types.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl");
 
 #ifdef EPAIR_DEBUG
 static int epair_debug = 0;
 SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW,
     &epair_debug, 0, "if_epair(4) debugging.");
 #define	DPRINTF(fmt, arg...)						\
 	if (epair_debug)						\
 		printf("[%s:%d] " fmt, __func__, __LINE__, ##arg)
 #else
 #define	DPRINTF(fmt, arg...)
 #endif
 
 static void epair_nh_sintr(struct mbuf *);
 static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *);
 static void epair_nh_drainedcpu(u_int);
 
 static void epair_start_locked(struct ifnet *);
 static int epair_media_change(struct ifnet *);
 static void epair_media_status(struct ifnet *, struct ifmediareq *);
 
 static int epair_clone_match(struct if_clone *, const char *);
 static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int epair_clone_destroy(struct if_clone *, struct ifnet *);
 
 static const char epairname[] = "epair";
 
 /* Netisr related definitions and sysctl. */
 static struct netisr_handler epair_nh = {
 	.nh_name	= epairname,
 	.nh_proto	= NETISR_EPAIR,
 	.nh_policy	= NETISR_POLICY_CPU,
 	.nh_handler	= epair_nh_sintr,
 	.nh_m2cpuid	= epair_nh_m2cpuid,
 	.nh_drainedcpu	= epair_nh_drainedcpu,
 };
 
 static int
 sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&epair_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&epair_nh, qlimit));
 }
 SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_epair_netisr_maxqlen, "I",
     "Maximum if_epair(4) netisr \"hw\" queue length");
 
 struct epair_softc {
 	struct ifnet	*ifp;		/* This ifp. */
 	struct ifnet	*oifp;		/* other ifp of pair. */
 	struct ifmedia	media;		/* Media config (fake). */
 	u_int		refcount;	/* # of mbufs in flight. */
 	u_int		cpuid;		/* CPU ID assigned upon creation. */
 	void		(*if_qflush)(struct ifnet *);
 					/* Original if_qflush routine. */
 };
 
 /*
  * Per-CPU list of ifps with data in the ifq that needs to be flushed
  * to the netisr ``hw'' queue before we allow any further direct queuing
  * to the ``hw'' queue.
  */
 struct epair_ifp_drain {
 	STAILQ_ENTRY(epair_ifp_drain)	ifp_next;
 	struct ifnet			*ifp;
 };
 STAILQ_HEAD(eid_list, epair_ifp_drain);
 
 #define	EPAIR_LOCK_INIT(dpcpu)		mtx_init(&(dpcpu)->if_epair_mtx, \
 					    "if_epair", NULL, MTX_DEF)
 #define	EPAIR_LOCK_DESTROY(dpcpu)	mtx_destroy(&(dpcpu)->if_epair_mtx)
 #define	EPAIR_LOCK_ASSERT(dpcpu)	mtx_assert(&(dpcpu)->if_epair_mtx, \
 					    MA_OWNED)
 #define	EPAIR_LOCK(dpcpu)		mtx_lock(&(dpcpu)->if_epair_mtx)
 #define	EPAIR_UNLOCK(dpcpu)		mtx_unlock(&(dpcpu)->if_epair_mtx)
 
 #ifdef INVARIANTS
 #define	EPAIR_REFCOUNT_INIT(r, v)	refcount_init((r), (v))
 #define	EPAIR_REFCOUNT_AQUIRE(r)	refcount_acquire((r))
 #define	EPAIR_REFCOUNT_RELEASE(r)	refcount_release((r))
 #define	EPAIR_REFCOUNT_ASSERT(a, p)	KASSERT(a, p)
 #else
 #define	EPAIR_REFCOUNT_INIT(r, v)
 #define	EPAIR_REFCOUNT_AQUIRE(r)
 #define	EPAIR_REFCOUNT_RELEASE(r)
 #define	EPAIR_REFCOUNT_ASSERT(a, p)
 #endif
 
 static MALLOC_DEFINE(M_EPAIR, epairname,
     "Pair of virtual cross-over connected Ethernet-like interfaces");
 
 static VNET_DEFINE(struct if_clone *, epair_cloner);
 #define	V_epair_cloner	VNET(epair_cloner)
 
 /*
  * DPCPU area and functions.
  */
 struct epair_dpcpu {
 	struct mtx	if_epair_mtx;		/* Per-CPU locking. */
 	int		epair_drv_flags;	/* Per-CPU ``hw'' drv flags. */
 	struct eid_list	epair_ifp_drain_list;	/* Per-CPU list of ifps with
 						 * data in the ifq. */
 };
 DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu);
 
 static void
 epair_dpcpu_init(void)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct eid_list *s;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 
 		/* Initialize per-cpu lock. */
 		EPAIR_LOCK_INIT(epair_dpcpu);
 
 		/* Driver flags are per-cpu as are our netisr "hw" queues. */
 		epair_dpcpu->epair_drv_flags = 0;
 
 		/*
 		 * Initialize per-cpu drain list.
 		 * Manually do what STAILQ_HEAD_INITIALIZER would do.
 		 */
 		s = &epair_dpcpu->epair_ifp_drain_list;
 		s->stqh_first = NULL;
 		s->stqh_last = &s->stqh_first;
 	} 
 }
 
 static void
 epair_dpcpu_detach(void)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 
 		/* Destroy per-cpu lock. */
 		EPAIR_LOCK_DESTROY(epair_dpcpu);
 	}
 }
 
 /*
  * Helper functions.
  */
 static u_int
 cpuid_from_ifp(struct ifnet *ifp)
 {
 	struct epair_softc *sc;
 
 	if (ifp == NULL)
 		return (0);
 	sc = ifp->if_softc;
 
 	return (sc->cpuid);
 }
 
 /*
  * Netisr handler functions.
  */
 static void
 epair_nh_sintr(struct mbuf *m)
 {
 	struct ifnet *ifp;
 	struct epair_softc *sc;
 
 	ifp = m->m_pkthdr.rcvif;
 	(*ifp->if_input)(ifp, m);
 	sc = ifp->if_softc;
 	EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 	EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 	    ("%s: ifp=%p sc->refcount not >= 1: %d",
 	    __func__, ifp, sc->refcount));
 	DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount);
 }
 
 static struct mbuf *
 epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
 {
 
 	*cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif);
 
 	return (m);
 }
 
 static void
 epair_nh_drainedcpu(u_int cpuid)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_ifp_drain *elm, *tvar;
 	struct ifnet *ifp;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	/*
 	 * Assume our "hw" queue and possibly ifq will be emptied
 	 * again. In case we will overflow the "hw" queue while
 	 * draining, epair_start_locked will set IFF_DRV_OACTIVE
 	 * again and we will stop and return.
 	 */
 	STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
 	    ifp_next, tvar) {
 		ifp = elm->ifp;
 		epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE;
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		epair_start_locked(ifp);
 
 		IFQ_LOCK(&ifp->if_snd);
 		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
 			struct epair_softc *sc;
 
 			STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list,
 			    elm, epair_ifp_drain, ifp_next);
 			/* The cached ifp goes off the list. */
 			sc = ifp->if_softc;
 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
 			    __func__, ifp, sc->refcount));
 			free(elm, M_EPAIR);
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 
 		if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) {
 			/* Our "hw"q overflew again. */
 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 			DPRINTF("hw queue length overflow at %u\n",
 			    epair_nh.nh_qlimit);
 			break;
 		}
 	}
 	EPAIR_UNLOCK(epair_dpcpu);
 }
 
 /*
  * Network interface (`if') related functions.
  */
 static void
 epair_remove_ifp_from_draining(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_ifp_drain *elm, *tvar;
 	u_int cpuid;
 
 	CPU_FOREACH(cpuid) {
 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
 		EPAIR_LOCK(epair_dpcpu);
 		STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
 		    ifp_next, tvar) {
 			if (ifp == elm->ifp) {
 				struct epair_softc *sc;
 
 				STAILQ_REMOVE(
 				    &epair_dpcpu->epair_ifp_drain_list, elm,
 				    epair_ifp_drain, ifp_next);
 				/* The cached ifp goes off the list. */
 				sc = ifp->if_softc;
 				EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 				EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 				    ("%s: ifp=%p sc->refcount not >= 1: %d",
 				    __func__, ifp, sc->refcount));
 				free(elm, M_EPAIR);
 			}
 		}
 		EPAIR_UNLOCK(epair_dpcpu);
 	}
 }
 
 static int
 epair_add_ifp_for_draining(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_softc *sc;
 	struct epair_ifp_drain *elm = NULL;
 
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 	STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next)
 		if (elm->ifp == ifp)
 			break;
 	/* If the ifp is there already, return success. */
 	if (elm != NULL)
 		return (0);
 
 	elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO);
 	if (elm == NULL)
 		return (ENOMEM);
 
 	elm->ifp = ifp;
 	/* Add a reference for the ifp pointer on the list. */
 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 	STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next);
 
 	return (0);
 }
 
 static void
 epair_start_locked(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct mbuf *m;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 	int error;
 
 	DPRINTF("ifp=%p\n", ifp);
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return;
 
 	/*
 	 * We get patckets here from ether_output via if_handoff()
 	 * and ned to put them into the input queue of the oifp
 	 * and call oifp->if_input() via netisr/epair_sintr().
 	 */
 	oifp = sc->oifp;
 	sc = oifp->if_softc;
 	for (;;) {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
 		BPF_MTAP(ifp, m);
 
 		/*
 		 * In case the outgoing interface is not usable,
 		 * drop the packet.
 		 */
 		if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 		    (oifp->if_flags & IFF_UP) ==0) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			continue;
 		}
 		DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
 
 		/*
 		 * Add a reference so the interface cannot go while the
 		 * packet is in transit as we rely on rcvif to stay valid.
 		 */
 		EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 		m->m_pkthdr.rcvif = oifp;
 		CURVNET_SET_QUIET(oifp->if_vnet);
 		error = netisr_queue(NETISR_EPAIR, m);
 		CURVNET_RESTORE();
 		if (!error) {
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			/* Someone else received the packet. */
 			if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
 		} else {
 			/* The packet was freed already. */
 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			(void) epair_add_ifp_for_draining(ifp);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
 			    __func__, oifp, sc->refcount));
 		}
 	}
 }
 
 static void
 epair_start(struct ifnet *ifp)
 {
 	struct epair_dpcpu *epair_dpcpu;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	epair_start_locked(ifp);
 	EPAIR_UNLOCK(epair_dpcpu);
 }
 
 static int
 epair_transmit_locked(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 	int error, len;
 	short mflags;
 
 	DPRINTF("ifp=%p m=%p\n", ifp, m);
 	sc = ifp->if_softc;
 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
 	EPAIR_LOCK_ASSERT(epair_dpcpu);
 
 	if (m == NULL)
 		return (0);
 	
 	/*
 	 * We are not going to use the interface en/dequeue mechanism
 	 * on the TX side. We are called from ether_output_frame()
 	 * and will put the packet into the incoming queue of the
 	 * other interface of our pair via the netsir.
 	 */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		return (ENXIO);
 	}
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * In case the outgoing interface is not usable,
 	 * drop the packet.
 	 */
 	oifp = sc->oifp;
 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (oifp->if_flags & IFF_UP) ==0) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (0);
 	}
 	len = m->m_pkthdr.len;
 	mflags = m->m_flags;
 	DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
 
 #ifdef ALTQ
 	/* Support ALTQ via the clasic if_start() path. */
 	IF_LOCK(&ifp->if_snd);
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
 		if (error)
 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		IF_UNLOCK(&ifp->if_snd);
 		if (!error) {
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 			if (mflags & (M_BCAST|M_MCAST))
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 			
 			if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
 				epair_start_locked(ifp);
 			else
 				(void)epair_add_ifp_for_draining(ifp);
 		}
 		return (error);
 	}
 	IF_UNLOCK(&ifp->if_snd);
 #endif
 
 	if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
 		/*
 		 * Our hardware queue is full, try to fall back
 		 * queuing to the ifq but do not call ifp->if_start.
 		 * Either we are lucky or the packet is gone.
 		 */
 		IFQ_ENQUEUE(&ifp->if_snd, m, error);
 		if (!error)
 			(void)epair_add_ifp_for_draining(ifp);
 		return (error);
 	}
 	sc = oifp->if_softc;
 	/*
 	 * Add a reference so the interface cannot go while the
 	 * packet is in transit as we rely on rcvif to stay valid.
 	 */
 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
 	m->m_pkthdr.rcvif = oifp;
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	error = netisr_queue(NETISR_EPAIR, m);
 	CURVNET_RESTORE();
 	if (!error) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		/*
 		 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
 		 * but as we bypass all this we have to duplicate
 		 * the logic another time.
 		 */
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if (mflags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		/* Someone else received the packet. */
 		if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
 	} else {
 		/* The packet was freed already. */
 		epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
 		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		EPAIR_REFCOUNT_RELEASE(&sc->refcount);
 		EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
 		    ("%s: ifp=%p sc->refcount not >= 1: %d",
 		    __func__, oifp, sc->refcount));
 	}
 
 	return (error);
 }
 
 static int
 epair_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_dpcpu *epair_dpcpu;
 	int error;
 
 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
 	EPAIR_LOCK(epair_dpcpu);
 	error = epair_transmit_locked(ifp, m);
 	EPAIR_UNLOCK(epair_dpcpu);
 	return (error);
 }
 
 static void
 epair_qflush(struct ifnet *ifp)
 {
 	struct epair_softc *sc;
 	
 	sc = ifp->if_softc;
 	KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n",
 	    __func__, ifp, sc));
 	/*
 	 * Remove this ifp from all backpointer lists. The interface will not
 	 * usable for flushing anyway nor should it have anything to flush
 	 * after if_qflush().
 	 */
 	epair_remove_ifp_from_draining(ifp);
 
 	if (sc->if_qflush)
 		sc->if_qflush(ifp);
 }
 
 static int
 epair_media_change(struct ifnet *ifp __unused)
 {
 
 	/* Do nothing. */
 	return (0);
 }
 
 static void
 epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
 {
 
 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
 }
 
 static int
 epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epair_softc *sc;
 	struct ifreq *ifr;
 	int error;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		error = 0;
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		sc = ifp->if_softc;
 		error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
 		break;
 
 	case SIOCSIFMTU:
 		/* We basically allow all kinds of MTUs. */
 		ifp->if_mtu = ifr->ifr_mtu;
 		error = 0;
 		break;
 
 	default:
 		/* Let the common ethernet handler process this. */
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 static void
 epair_init(void *dummy __unused)
 {
 }
 
 
 /*
  * Interface cloning functions.
  * We use our private ones so that we can create/destroy our secondary
  * device along with the primary one.
  */
 static int
 epair_clone_match(struct if_clone *ifc, const char *name)
 {
 	const char *cp;
 
 	DPRINTF("name='%s'\n", name);
 
 	/*
 	 * Our base name is epair.
 	 * Our interfaces will be named epair<n>[ab].
 	 * So accept anything of the following list:
 	 * - epair
 	 * - epair<n>
 	 * but not the epair<n>[ab] versions.
 	 */
 	if (strncmp(epairname, name, sizeof(epairname)-1) != 0)
 		return (0);
 
 	for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 static int
 epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	struct epair_softc *sca, *scb;
 	struct ifnet *ifp;
 	char *dp;
 	int error, unit, wildcard;
 	uint8_t eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	/*
 	 * We are abusing params to create our second interface.
 	 * Actually we already created it and called if_clone_create()
 	 * for it to do the official insertion procedure the moment we knew
 	 * it cannot fail anymore. So just do attach it here.
 	 */
 	if (params) {
 		scb = (struct epair_softc *)params;
 		ifp = scb->ifp;
 		/* Assign a hopefully unique, locally administered etheraddr. */
 		eaddr[0] = 0x02;
 		eaddr[3] = (ifp->if_index >> 8) & 0xff;
 		eaddr[4] = ifp->if_index & 0xff;
 		eaddr[5] = 0x0b;
 		ether_ifattach(ifp, eaddr);
 		/* Correctly set the name for the cloner list. */
 		strlcpy(name, scb->ifp->if_xname, len);
 		return (0);
 	}
 
 	/* Try to see if a special unit was requested. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If no unit had been given, we need to adjust the ifName.
 	 * Also make sure there is space for our extra [ab] suffix.
 	 */
 	for (dp = name; *dp != '\0'; dp++);
 	if (wildcard) {
 		error = snprintf(dp, len - (dp - name), "%d", unit);
 		if (error > len - (dp - name) - 1) {
 			/* ifName too long. */
 			ifc_free_unit(ifc, unit);
 			return (ENOSPC);
 		}
 		dp += error;
 	}
 	if (len - (dp - name) - 1 < 1) {
 		/* No space left for our [ab] suffix. */
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 	*dp = 'b';
 	/* Must not change dp so we can replace 'a' by 'b' later. */
 	*(dp+1) = '\0';
 
 	/* Check if 'a' and 'b' interfaces already exist. */ 
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 	*dp = 'a';
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 
 	/* Allocate memory for both [ab] interfaces */
 	sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	EPAIR_REFCOUNT_INIT(&sca->refcount, 1);
 	sca->ifp = if_alloc(IFT_ETHER);
 	if (sca->ifp == NULL) {
 		free(sca, M_EPAIR);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 
 	scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	EPAIR_REFCOUNT_INIT(&scb->refcount, 1);
 	scb->ifp = if_alloc(IFT_ETHER);
 	if (scb->ifp == NULL) {
 		free(scb, M_EPAIR);
 		if_free(sca->ifp);
 		free(sca, M_EPAIR);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 	
 	/*
 	 * Cross-reference the interfaces so we will be able to free both.
 	 */
 	sca->oifp = scb->ifp;
 	scb->oifp = sca->ifp;
 
 	/*
 	 * Calculate the cpuid for netisr queueing based on the
 	 * ifIndex of the interfaces. As long as we cannot configure
 	 * this or use cpuset information easily we cannot guarantee
 	 * cache locality but we can at least allow parallelism.
 	 */
 	sca->cpuid =
 	    netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount());
 	scb->cpuid =
 	    netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount());
 
 	/* Initialise pseudo media types. */
 	ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T);
 	ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T);
 	
 	/* Finish initialization of interface <n>a. */
 	ifp = sca->ifp;
 	ifp->if_softc = sca;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = epairname;
 	ifp->if_dunit = unit;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU;
 	ifp->if_start = epair_start;
 	ifp->if_ioctl = epair_ioctl;
 	ifp->if_init  = epair_init;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/* Assign a hopefully unique, locally administered etheraddr. */
 	eaddr[0] = 0x02;
 	eaddr[3] = (ifp->if_index >> 8) & 0xff;
 	eaddr[4] = ifp->if_index & 0xff;
 	eaddr[5] = 0x0a;
 	ether_ifattach(ifp, eaddr);
 	sca->if_qflush = ifp->if_qflush;
 	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/* Swap the name and finish initialization of interface <n>b. */
 	*dp = 'b';
 
 	ifp = scb->ifp;
 	ifp->if_softc = scb;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = epairname;
 	ifp->if_dunit = unit;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU;
 	ifp->if_start = epair_start;
 	ifp->if_ioctl = epair_ioctl;
 	ifp->if_init  = epair_init;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/* We need to play some tricks here for the second interface. */
 	strlcpy(name, epairname, len);
 	error = if_clone_create(name, len, (caddr_t)scb);
 	if (error)
 		panic("%s: if_clone_create() for our 2nd iface failed: %d",
 		    __func__, error);
 	scb->if_qflush = ifp->if_qflush;
 	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/*
 	 * Restore name to <n>a as the ifp for this will go into the
 	 * cloner list for the initial call.
 	 */
 	strlcpy(name, sca->ifp->if_xname, len);
 	DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb);
 
 	/* Tell the world, that we are ready to rock. */
 	sca->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	scb->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if_link_state_change(sca->ifp, LINK_STATE_UP);
 	if_link_state_change(scb->ifp, LINK_STATE_UP);
 
 	return (0);
 }
 
 static int
 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifnet *oifp;
 	struct epair_softc *sca, *scb;
 	int unit, error;
 
 	DPRINTF("ifp=%p\n", ifp);
 
 	/*
 	 * In case we called into if_clone_destroyif() ourselves
 	 * again to remove the second interface, the softc will be
 	 * NULL. In that case so not do anything but return success.
 	 */
 	if (ifp->if_softc == NULL)
 		return (0);
 	
 	unit = ifp->if_dunit;
 	sca = ifp->if_softc;
 	oifp = sca->oifp;
 	scb = oifp->if_softc;
 
 	DPRINTF("ifp=%p oifp=%p\n", ifp, oifp);
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	if_link_state_change(oifp, LINK_STATE_DOWN);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	oifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Get rid of our second half. As the other of the two
 	 * interfaces may reside in a different vnet, we need to
 	 * switch before freeing them.
 	 */
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	ether_ifdetach(oifp);
 	/*
 	 * Wait for all packets to be dispatched to if_input.
 	 * The numbers can only go down as the interface is
 	 * detached so there is no need to use atomics.
 	 */
 	DPRINTF("scb refcnt=%u\n", scb->refcount);
 	EPAIR_REFCOUNT_ASSERT(scb->refcount == 1,
 	    ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount));
 	oifp->if_softc = NULL;
 	error = if_clone_destroyif(ifc, oifp);
 	if (error)
 		panic("%s: if_clone_destroyif() for our 2nd iface failed: %d",
 		    __func__, error);
 	if_free(oifp);
 	ifmedia_removeall(&scb->media);
 	free(scb, M_EPAIR);
 	CURVNET_RESTORE();
 
 	ether_ifdetach(ifp);
 	/*
 	 * Wait for all packets to be dispatched to if_input.
 	 */
 	DPRINTF("sca refcnt=%u\n", sca->refcount);
 	EPAIR_REFCOUNT_ASSERT(sca->refcount == 1,
 	    ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount));
 	if_free(ifp);
 	ifmedia_removeall(&sca->media);
 	free(sca, M_EPAIR);
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 static void
 vnet_epair_init(const void *unused __unused)
 {
 
 	V_epair_cloner = if_clone_advanced(epairname, 0,
 	    epair_clone_match, epair_clone_create, epair_clone_destroy);
-	netisr_register(&epair_nh);
+	netisr_register_vnet(&epair_nh);
 }
 VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_epair_init, NULL);
 
 static void
 vnet_epair_uninit(const void *unused __unused)
 {
 
-	netisr_unregister(&epair_nh);
+	netisr_unregister_vnet(&epair_nh);
 	if_clone_detach(V_epair_cloner);
 }
 VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_epair_uninit, NULL);
 
 static int
 epair_modevent(module_t mod, int type, void *data)
 {
 	int qlimit;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* For now limit us to one global mutex and one inq. */
 		epair_dpcpu_init();
 		epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */
 		if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit))
 		    epair_nh.nh_qlimit = qlimit;
+		netisr_register(&epair_nh);
 		if (bootverbose)
 			printf("%s initialized.\n", epairname);
 		break;
 	case MOD_UNLOAD:
+		netisr_unregister(&epair_nh);
 		epair_dpcpu_detach();
 		if (bootverbose)
 			printf("%s unloaded.\n", epairname);
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t epair_mod = {
 	"if_epair",
 	epair_modevent,
 	0
 };
 
 DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
 MODULE_VERSION(if_epair, 1);
Index: projects/vnet/sys/net/if_ethersubr.c
===================================================================
--- projects/vnet/sys/net/if_ethersubr.c	(revision 299172)
+++ projects/vnet/sys/net/if_ethersubr.c	(revision 299173)
@@ -1,1200 +1,1209 @@
 /*-
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(struct pfil_head, link_pfil_hook);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); 
 int	(*bridge_output_p)(struct ifnet *, struct mbuf *, 
 		struct sockaddr *, struct rtentry *);
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); 
 
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 #ifdef VIMAGE
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 #define	ETHER_IS_BROADCAST(addr) \
 	(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
 
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int csum_flags = 0;
 
 	if (src->m_pkthdr.csum_flags & CSUM_IP)
 		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
 	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
 		csum_flags |= CSUM_SCTP_VALID;
 	dst->m_pkthdr.csum_flags |= csum_flags;
 	if (csum_flags & CSUM_DATA_VALID)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  */
 static int
 ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct ether_header *eh;
 	struct arphdr *ah;
 	uint16_t etype;
 	const u_char *lladdr;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < ETHER_HDR_LEN)
 		return (ENOMEM);
 
 	eh = (struct ether_header *)req->buf;
 	lladdr = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		etype = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		etype = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		ah = (struct arphdr *)req->hdata;
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		switch(ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			etype = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			etype = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		if (req->flags & IFENCAP_FLAG_BROADCAST)
 			lladdr = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
 	memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	req->bufsize = sizeof(struct ether_header);
 
 	return (0);
 }
 
 
 static int
 ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro, u_char *phdr,
 	uint32_t *pflags)
 {
 	struct ether_header *eh;
 	uint32_t lleflags = 0;
 	int error = 0;
 #if defined(INET) || defined(INET6)
 	uint16_t etype;
 #endif
 
 	eh = (struct ether_header *)phdr;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags);
 		else {
 			if (m->m_flags & M_BCAST)
 				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
 				    ETHER_ADDR_LEN);
 			else {
 				const struct in_addr *a;
 				a = &(((const struct sockaddr_in *)dst)->sin_addr);
 				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
 			}
 			etype = htons(ETHERTYPE_IP);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0)
 			error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags);
 		else {
 			const struct in6_addr *a6;
 			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
 			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
 			etype = htons(ETHERTYPE_IPV6);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	int error = 0;
 	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	int loop_copy = 1;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		phdr = ro->ro_prepend;
 		hlen = ro->ro_plen;
 		pflags = ro->ro_flags;
 	}
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = ETHER_HDR_LEN;
 		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags);
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, dst->sa_family, 0));
 	}
 	loop_copy = pflags & RT_MAY_LOOP;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 *
 	 * Note that we do prepend regardless of RT_HAS_HEADER flag.
 	 * This is done because BPF code shifts m_data pointer
 	 * to the end of ethernet header prior to calling if_output().
 	 */
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, phdr, hlen);
 	}
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		struct mbuf *n;
 
 		/*
 		 * Because if_simloop() modifies the packet, we need a
 		 * writable copy through m_dup() instead of a readonly
 		 * one as m_copy[m] would give us. The alternative would
 		 * be to modify if_simloop() to handle the readonly mbuf,
 		 * but performancewise it is mostly equivalent (trading
 		 * extra data copying vs. extra locking).
 		 *
 		 * XXX This is a local workaround.  A number of less
 		 * often used kernel parts suffer from the same bug.
 		 * See PR kern/105943 for a proposed general solution.
 		 */
 		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 			update_mbuf_csumflags(m, n);
 			(void)if_simloop(ifp, n, dst->sa_family, hlen);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
        /*
 	* Bridges require special output handling.
 	*/
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	int i;
 
 	if (PFIL_HOOKED(&V_link_pfil_hook)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL);
 
 		if (i != 0)
 			return (EACCES);
 
 		if (m == NULL)
 			return (0);
 	}
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 #if defined(INET) || defined(INET6)
 #endif
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER);
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_p)(ifp, m);
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 */
 	if (ifp->if_bridge != NULL) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL,
 	    ("%s: NULL interface pointer", __func__));
 	ether_input_internal(m->m_pkthdr.rcvif, m);
 }
 
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 static void
+ether_init(__unused void *arg)
+{
+
+	netisr_register(&ether_nh);
+}
+SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
+
+static void
 vnet_ether_init(__unused void *arg)
 {
 	int i;
 
 	/* Initialize packet filter hooks. */
 	V_link_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_link_pfil_hook.ph_af = AF_LINK;
 	if ((i = pfil_head_register(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil link hook, "
 			"error %d\n", __func__, i);
-	netisr_register(&ether_nh);
+	netisr_register_vnet(&ether_nh);
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
  
 static void
 vnet_ether_pfil_destroy(__unused void *arg)
 {
 	int i;
 
 	if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil link hook, "
 			"error %d\n", __func__, i);
 }
 VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
     vnet_ether_pfil_destroy, NULL);
 
 static void
 vnet_ether_destroy(__unused void *arg)
 {
 
-	netisr_unregister(&ether_nh);
+	netisr_unregister_vnet(&ether_nh);
 }
 VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_destroy, NULL);
 
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	struct mbuf *mn;
 
 	/*
 	 * The drivers are allowed to pass in a chain of packets linked with
 	 * m_nextpkt. We split them up into separate packets here and pass
 	 * them up. This allows the drivers to amortize the receive lock.
 	 */
 	while (m) {
 		mn = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We will rely on rcvif being set properly in the deferred context,
 		 * so assert it is correct here.
 		 */
-		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__));
+		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
+		    "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		netisr_dispatch(NETISR_ETHER, m);
 		CURVNET_RESTORE();
 		m = mn;
 	}
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL);
 
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 * Strip off Ethernet header.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 	m_adj(m, ETHER_HDR_LEN);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		goto discard;
 	}
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		/*
 		 * Put back the ethernet header so netgraph has a
 		 * consistent view of inbound packets.
 		 */
 		M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	static char etherbuf[18];
 	snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
 	return (etherbuf);
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	int i;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	if_attach(ifp);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce Ethernet MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break; 
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 }
 
 /*
  * Perform common duties while detaching an Ethernet interface
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
 	uuid_ether_del(LLADDR(sdl));
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	if (ng_ether_attach_p != NULL) {
 		CURVNET_SET_QUIET(new_vnet);
 		(*ng_ether_attach_p)(ifp);
 		CURVNET_RESTORE();
 	}
 }
 #endif
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
 			crc <<= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
 		}
 	}
 
 	return (crc);
 }
 
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 		{
 			struct sockaddr *sa;
 
 			sa = (struct sockaddr *) & ifr->ifr_data;
 			bcopy(IF_LLADDR(ifp),
 			      (caddr_t) sa->sa_data, ETHER_ADDR_LEN);
 		}
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		if (ifr->ifr_mtu > ETHERMTU) {
 			error = EINVAL;
 		} else {
 			ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 	default:
 		error = EINVAL;			/* XXX netbsd has ENOTTY??? */
 		break;
 	}
 	return (error);
 }
 
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if (!ETHER_IS_MULTICAST(e_addr))
 			return EADDRNOTAVAIL;
 		*llsa = 0;
 		return 0;
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = 0;
 			return 0;
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return EAFNOSUPPORT;
 	}
 }
 
 static moduledata_t ether_mod = {
 	.name = "ether",
 };
 
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 struct mbuf *
 ether_vlanencap(struct mbuf *m, uint16_t tag)
 {
 	struct ether_vlan_header *evl;
 
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 	/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
 
 	if (m->m_len < sizeof(*evl)) {
 		m = m_pullup(m, sizeof(*evl));
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * Transform the Ethernet header into an Ethernet header
 	 * with 802.1Q encapsulation.
 	 */
 	evl = mtod(m, struct ether_vlan_header *);
 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
 	    (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
 	evl->evl_tag = htons(tag);
 	return (m);
 }
 
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
Index: projects/vnet/sys/net/netisr.c
===================================================================
--- projects/vnet/sys/net/netisr.c	(revision 299172)
+++ projects/vnet/sys/net/netisr.c	(revision 299173)
@@ -1,1380 +1,1545 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * netisr is a packet dispatch service, allowing synchronous (directly
  * dispatched) and asynchronous (deferred dispatch) processing of packets by
  * registered protocol handlers.  Callers pass a protocol identifier and
  * packet to netisr, along with a direct dispatch hint, and work will either
  * be immediately processed by the registered handler, or passed to a
  * software interrupt (SWI) thread for deferred dispatch.  Callers will
  * generally select one or the other based on:
  *
  * - Whether directly dispatching a netisr handler lead to code reentrance or
  *   lock recursion, such as entering the socket code from the socket code.
  * - Whether directly dispatching a netisr handler lead to recursive
  *   processing, such as when decapsulating several wrapped layers of tunnel
  *   information (IPSEC within IPSEC within ...).
  *
  * Maintaining ordering for protocol streams is a critical design concern.
  * Enforcing ordering limits the opportunity for concurrency, but maintains
  * the strong ordering requirements found in some protocols, such as TCP.  Of
  * related concern is CPU affinity--it is desirable to process all data
  * associated with a particular stream on the same CPU over time in order to
  * avoid acquiring locks associated with the connection on different CPUs,
  * keep connection data in one cache, and to generally encourage associated
  * user threads to live on the same CPU as the stream.  It's also desirable
  * to avoid lock migration and contention where locks are associated with
  * more than one flow.
  *
  * netisr supports several policy variations, represented by the
  * NETISR_POLICY_* constants, allowing protocols to play various roles in
  * identifying flows, assigning work to CPUs, etc.  These are described in
  * netisr.h.
  */
 
 #include "opt_ddb.h"
 #include "opt_device_polling.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/interrupt.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/netisr_internal.h>
 #include <net/vnet.h>
 
 /*-
  * Synchronize use and modification of the registered netisr data structures;
  * acquire a read lock while modifying the set of registered protocols to
  * prevent partially registered or unregistered protocols from being run.
  *
  * The following data structures and fields are protected by this lock:
  *
  * - The netisr_proto array, including all fields of struct netisr_proto.
  * - The nws array, including all fields of struct netisr_worker.
  * - The nws_array array.
  *
  * Note: the NETISR_LOCKING define controls whether read locks are acquired
  * in packet processing paths requiring netisr registration stability.  This
  * is disabled by default as it can lead to measurable performance
  * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
  * because netisr registration and unregistration is extremely rare at
  * runtime.  If it becomes more common, this decision should be revisited.
  *
  * XXXRW: rmlocks don't support assertions.
  */
 static struct rmlock	netisr_rmlock;
 #define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
 				    RM_NOWITNESS)
 #define	NETISR_LOCK_ASSERT()
 #define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
 #define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
 #define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
 #define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
 /* #define	NETISR_LOCKING */
 
 static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");
 
 /*-
  * Three global direct dispatch policies are supported:
  *
  * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of
  * context (may be overriden by protocols).
  *
  * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
  * and we're running on the CPU the work would be performed on, then direct
  * dispatch it if it wouldn't violate ordering constraints on the workstream.
  *
  * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
  * always direct dispatch.  (The default.)
  *
  * Notice that changing the global policy could lead to short periods of
  * misordered processing, but this is considered acceptable as compared to
  * the complexity of enforcing ordering during policy changes.  Protocols can
  * override the global policy (when they're not doing that, they select
  * NETISR_DISPATCH_DEFAULT).
  */
 #define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
 #define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
 static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
 static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN,
     0, 0, sysctl_netisr_dispatch_policy, "A",
     "netisr dispatch policy");
 
 /*
  * Allow the administrator to limit the number of threads (CPUs) to use for
  * netisr.  We don't check netisr_maxthreads before creating the thread for
  * CPU 0. This must be set at boot. We will create at most one thread per CPU.
  * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and
  * therefore only 1 workstream. If set to -1, netisr would use all cpus
  * (mp_ncpus) and therefore would have those many workstreams. One workstream
  * per thread (CPU).
  */
 static int	netisr_maxthreads = 1;		/* Max number of threads. */
 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
     &netisr_maxthreads, 0,
     "Use at most this many CPUs for netisr processing");
 
 static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
     &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");
 
 /*
  * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit,
  * both for initial configuration and later modification using
  * netisr_setqlimit().
  */
 #define	NETISR_DEFAULT_MAXQLIMIT	10240
 static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
     &netisr_maxqlimit, 0,
     "Maximum netisr per-protocol, per-CPU queue depth.");
 
 /*
  * The default per-workstream mbuf queue limit for protocols that don't
  * initialize the nh_qlimit field of their struct netisr_handler.  If this is
  * set above netisr_maxqlimit, we truncate it to the maximum during boot.
  */
 #define	NETISR_DEFAULT_DEFAULTQLIMIT	256
 static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
     &netisr_defaultqlimit, 0,
     "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
 
 /*
  * Store and export the compile-time constant NETISR_MAXPROT limit on the
  * number of protocols that can register with netisr at a time.  This is
  * required for crashdump analysis, as it sizes netisr_proto[].
  */
 static u_int	netisr_maxprot = NETISR_MAXPROT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
     &netisr_maxprot, 0,
     "Compile-time limit on the number of protocols supported by netisr.");
 
 /*
  * The netisr_proto array describes all registered protocols, indexed by
  * protocol number.  See netisr_internal.h for more details.
  */
-static VNET_DEFINE(struct netisr_proto,	netisr_proto[NETISR_MAXPROT]);
-#define	V_netisr_proto		VNET(netisr_proto)
+static struct netisr_proto	netisr_proto[NETISR_MAXPROT];
 
 /*
+ * The netisr_enable array describes a per-VNET flag for registered
+ * protocols on whether this netisr is active in this VNET or not.
+ * netisr_register() will automatically enable the netisr for the
+ * default VNET and all currently active instances.
+ * netisr_unregister() will disable all active VNETs, including vnet0.
+ * Individual network stack instances can be enabled/disabled by the
+ * netisr_(un)register _vnet() functions.
+ * With this we keep the one netisr_proto per protocol but add a
+ * mechanism to stop netisr processing for vnet teardown.
+ * Apart from that we expect a VNET to always be enabled.
+ */
+static VNET_DEFINE(u_int,	netisr_enable[NETISR_MAXPROT]);
+#define	V_netisr_enable		VNET(netisr_enable)
+
+/*
  * Per-CPU workstream data.  See netisr_internal.h for more details.
  */
 DPCPU_DEFINE(struct netisr_workstream, nws);
 
 /*
  * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
  * accessing workstreams.  This allows constructions of the form
  * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
  */
 static u_int				 nws_array[MAXCPU];
 
 /*
  * Number of registered workstreams.  Will be at most the number of running
  * CPUs once fully started.
  */
 static u_int				 nws_count;
 SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
     &nws_count, 0, "Number of extant netisr threads.");
 
 /*
  * Synchronization for each workstream: a mutex protects all mutable fields
  * in each stream, including per-protocol state (mbuf queues).  The SWI is
  * woken up if asynchronous dispatch is required.
  */
 #define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
 #define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
 #define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
 #define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
 
 /*
  * Utility routines for protocols that implement their own mapping of flows
  * to CPUs.
  */
 u_int
 netisr_get_cpucount(void)
 {
 
 	return (nws_count);
 }
 
 u_int
 netisr_get_cpuid(u_int cpunumber)
 {
 
 	KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber,
 	    nws_count));
 
 	return (nws_array[cpunumber]);
 }
 
 /*
  * The default implementation of flow -> CPU ID mapping.
  *
  * Non-static so that protocols can use it to map their own work to specific
  * CPUs in a manner consistent to netisr for affinity purposes.
  */
 u_int
 netisr_default_flow2cpu(u_int flowid)
 {
 
 	return (nws_array[flowid % nws_count]);
 }
 
 /*
  * Dispatch tunable and sysctl configuration.
  */
 struct netisr_dispatch_table_entry {
 	u_int		 ndte_policy;
 	const char	*ndte_policy_str;
 };
 static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
 	{ NETISR_DISPATCH_DEFAULT, "default" },
 	{ NETISR_DISPATCH_DEFERRED, "deferred" },
 	{ NETISR_DISPATCH_HYBRID, "hybrid" },
 	{ NETISR_DISPATCH_DIRECT, "direct" },
 };
 static const u_int netisr_dispatch_table_len =
     (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0]));
 
 static void
 netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
     u_int buflen)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	const char *str;
 	u_int i;
 
 	str = "unknown";
 	for (i = 0; i < netisr_dispatch_table_len; i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (ndtep->ndte_policy == dispatch_policy) {
 			str = ndtep->ndte_policy_str;
 			break;
 		}
 	}
 	snprintf(buffer, buflen, "%s", str);
 }
 
 static int
 netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	u_int i;
 
 	for (i = 0; i < netisr_dispatch_table_len; i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (strcmp(ndtep->ndte_policy_str, str) == 0) {
 			*dispatch_policyp = ndtep->ndte_policy;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 static int
 sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
 {
 	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
 	u_int dispatch_policy;
 	int error;
 
 	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
 	    sizeof(tmp));
 	error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
 	if (error == 0 && req->newptr != NULL) {
 		error = netisr_dispatch_policy_from_str(tmp,
 		    &dispatch_policy);
 		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
 			error = EINVAL;
 		if (error == 0)
 			netisr_dispatch_policy = dispatch_policy;
 	}
 	return (error);
 }
 
 /*
  * Register a new netisr handler, which requires initializing per-protocol
  * fields for each workstream.  All netisr work is briefly suspended while
  * the protocol is installed.
  */
 void
 netisr_register(const struct netisr_handler *nhp)
 {
+	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 	const char *name;
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 	name = nhp->nh_name;
 
 	/*
 	 * Test that the requested registration is valid.
 	 */
 	KASSERT(nhp->nh_name != NULL,
 	    ("%s: nh_name NULL for %u", __func__, proto));
 	KASSERT(nhp->nh_handler != NULL,
 	    ("%s: nh_handler NULL for %s", __func__, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
 	    nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_policy == NETISR_POLICY_CPU,
 	    ("%s: unsupported nh_policy %u for %s", __func__,
 	    nhp->nh_policy, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_m2flow == NULL,
 	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
 	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
 	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
 
 	/*
 	 * Test that no existing registration exists for this protocol.
 	 */
 	NETISR_WLOCK();
-	KASSERT(V_netisr_proto[proto].np_name == NULL,
+	KASSERT(netisr_proto[proto].np_name == NULL,
 	    ("%s(%u, %s): name present", __func__, proto, name));
-	KASSERT(V_netisr_proto[proto].np_handler == NULL,
+	KASSERT(netisr_proto[proto].np_handler == NULL,
 	    ("%s(%u, %s): handler present", __func__, proto, name));
 
-	V_netisr_proto[proto].np_name = name;
-	V_netisr_proto[proto].np_handler = nhp->nh_handler;
-	V_netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
-	V_netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
-	V_netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
+	netisr_proto[proto].np_name = name;
+	netisr_proto[proto].np_handler = nhp->nh_handler;
+	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
+	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
+	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
 	if (nhp->nh_qlimit == 0)
-		V_netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
+		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
 	else if (nhp->nh_qlimit > netisr_maxqlimit) {
 		printf("%s: %s requested queue limit %u capped to "
 		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
 		    netisr_maxqlimit);
-		V_netisr_proto[proto].np_qlimit = netisr_maxqlimit;
+		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
 	} else
-		V_netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
-	V_netisr_proto[proto].np_policy = nhp->nh_policy;
-	V_netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
+		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
+	netisr_proto[proto].np_policy = nhp->nh_policy;
+	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		bzero(npwp, sizeof(*npwp));
-		npwp->nw_qlimit = V_netisr_proto[proto].np_qlimit;
+		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
 	}
+
+#ifdef VIMAGE
+	/*
+	 * Test that we are in vnet0 and have a curvnet set.
+	 */
+	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
+	KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p",
+	    __func__, curvnet, vnet0));
+	VNET_LIST_RLOCK_NOSLEEP();
+	VNET_FOREACH(vnet_iter) {
+		CURVNET_SET(vnet_iter);
+		V_netisr_enable[proto] = 1;
+		CURVNET_RESTORE();
+	}
+	VNET_LIST_RUNLOCK_NOSLEEP();
+#endif
 	NETISR_WUNLOCK();
 }
 
 /*
  * Clear drop counters across all workstreams for a protocol.
  */
 void
 netisr_clearqdrops(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(V_netisr_proto[proto].np_handler != NULL,
+	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qdrops = 0;
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Query current drop counters across all workstreams for a protocol.
  */
 void
 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
 {
 	struct netisr_work *npwp;
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	*qdropp = 0;
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
-	KASSERT(V_netisr_proto[proto].np_handler != NULL,
+	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		*qdropp += npwp->nw_qdrops;
 	}
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Query current per-workstream queue limit for a protocol.
  */
 void
 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
 {
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
-	KASSERT(V_netisr_proto[proto].np_handler != NULL,
+	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
-	*qlimitp = V_netisr_proto[proto].np_qlimit;
+	*qlimitp = netisr_proto[proto].np_qlimit;
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Update the queue limit across per-workstream queues for a protocol.  We
  * simply change the limits, and don't drain overflowed packets as they will
  * (hopefully) take care of themselves shortly.
  */
 int
 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	if (qlimit > netisr_maxqlimit)
 		return (EINVAL);
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(V_netisr_proto[proto].np_handler != NULL,
+	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
-	V_netisr_proto[proto].np_qlimit = qlimit;
+	netisr_proto[proto].np_qlimit = qlimit;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qlimit = qlimit;
 	}
 	NETISR_WUNLOCK();
 	return (0);
 }
 
 /*
  * Drain all packets currently held in a particular protocol work queue.
  */
 static void
 netisr_drain_proto(struct netisr_work *npwp)
 {
 	struct mbuf *m;
 
 	/*
 	 * We would assert the lock on the workstream but it's not passed in.
 	 */
 	while ((m = npwp->nw_head) != NULL) {
 		npwp->nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL)
 			npwp->nw_tail = NULL;
 		npwp->nw_len--;
 		m_freem(m);
 	}
 	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
 	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
 }
 
 /*
  * Remove the registration of a network protocol, which requires clearing
  * per-protocol fields across all workstreams, including freeing all mbufs in
  * the queues at time of unregister.  All work in netisr is briefly suspended
  * while this takes place.
  */
 void
 netisr_unregister(const struct netisr_handler *nhp)
 {
+	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
-	KASSERT(V_netisr_proto[proto].np_handler != NULL,
+	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
-	V_netisr_proto[proto].np_name = NULL;
-	V_netisr_proto[proto].np_handler = NULL;
-	V_netisr_proto[proto].np_m2flow = NULL;
-	V_netisr_proto[proto].np_m2cpuid = NULL;
-	V_netisr_proto[proto].np_qlimit = 0;
-	V_netisr_proto[proto].np_policy = 0;
+#ifdef VIMAGE
+	VNET_LIST_RLOCK_NOSLEEP();
+	VNET_FOREACH(vnet_iter) {
+		CURVNET_SET(vnet_iter);
+		V_netisr_enable[proto] = 0;
+		CURVNET_RESTORE();
+	}
+	VNET_LIST_RUNLOCK_NOSLEEP();
+#endif
+
+	netisr_proto[proto].np_name = NULL;
+	netisr_proto[proto].np_handler = NULL;
+	netisr_proto[proto].np_m2flow = NULL;
+	netisr_proto[proto].np_m2cpuid = NULL;
+	netisr_proto[proto].np_qlimit = 0;
+	netisr_proto[proto].np_policy = 0;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		netisr_drain_proto(npwp);
 		bzero(npwp, sizeof(*npwp));
 	}
 	NETISR_WUNLOCK();
 }
 
+#ifdef VIMAGE
+void
+netisr_register_vnet(const struct netisr_handler *nhp)
+{
+	u_int proto;
+
+	proto = nhp->nh_proto;
+
+	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
+	NETISR_WLOCK();
+	KASSERT(netisr_proto[proto].np_handler != NULL,
+	    ("%s(%u): protocol not registered for %s", __func__, proto,
+	    nhp->nh_name));
+	
+	V_netisr_enable[proto] = 1;
+	NETISR_WUNLOCK();
+}
+#endif
+
+#ifdef VIMAGE
+static void
+netisr_drain_proto_vnet(struct vnet *vnet, u_int proto)
+{
+	struct netisr_workstream *nwsp;
+	struct netisr_work *npwp;
+	struct mbuf *m, *mp, *n, *ne;
+	u_int i;
+#ifdef INVARIANTS
+	u_int b, k, f;
+#endif
+
+	KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__));
+	NETISR_LOCK_ASSERT();
+
+	CPU_FOREACH(i) {
+		nwsp = DPCPU_ID_PTR(i, nws);
+		if (nwsp->nws_intr_event == NULL)
+			continue;
+		npwp = &nwsp->nws_work[proto];
+		NWS_LOCK(nwsp);
+#ifdef INVARIANTS
+		b = npwp->nw_len;
+		k = f = 0;
+#endif
+
+		/*
+		 * Rather than dissecting and removing mbufs from the middle
+		 * of the chain, we build a new chain if the packet stays and
+		 * update the head and tial pointers at the end.  All packets
+		 * matching the given vnet are freed.
+		 */
+		m = npwp->nw_head;
+		n = ne = NULL;
+		while (m != NULL) {
+			mp = m;
+			m = m->m_nextpkt;
+			mp->m_nextpkt = NULL;
+			if (mp->m_pkthdr.rcvif->if_vnet != vnet) {
+				if (n == NULL) {
+					n = ne = mp;
+				} else {
+					ne->m_nextpkt = mp;
+					ne = mp;
+				}
+#ifdef INVARIANTS
+				k++;
+#endif
+				continue;
+			}
+			/* This is a packet in the selected vnet. Free it. */
+			npwp->nw_len--;
+#ifdef INVARIANTS
+			f++;
+#endif
+			m_freem(mp);
+		}
+		npwp->nw_head = n;
+		npwp->nw_tail = ne;
+#ifdef INVARIANTS
+		KASSERT((k + f) == b, ("%s: b %u != k %u + f %u, nw_len %u "
+		    "vnet %p proto %u", __func__, b, k, f, npwp->nw_len,
+		    vnet, proto));
+#endif
+		NWS_UNLOCK(nwsp);
+	}
+}
+
+void
+netisr_unregister_vnet(const struct netisr_handler *nhp)
+{
+	u_int proto;
+
+	proto = nhp->nh_proto;
+
+	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
+	NETISR_WLOCK();
+	KASSERT(netisr_proto[proto].np_handler != NULL,
+	    ("%s(%u): protocol not registered for %s", __func__, proto,
+	    nhp->nh_name));
+	
+	V_netisr_enable[proto] = 0;
+
+	netisr_drain_proto_vnet(curvnet, proto);
+	NETISR_WUNLOCK();
+}
+#endif
+
 /*
  * Compose the global and per-protocol policies on dispatch, and return the
  * dispatch policy to use.
  */
 static u_int
 netisr_get_dispatch(struct netisr_proto *npp)
 {
 
 	/*
 	 * Protocol-specific configuration overrides the global default.
 	 */
 	if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT)
 		return (npp->np_dispatch);
 	return (netisr_dispatch_policy);
 }
 
 /*
  * Look up the workstream given a packet and source identifier.  Do this by
  * checking the protocol's policy, and optionally call out to the protocol
  * for assistance if required.
  */
 static struct mbuf *
 netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
     uintptr_t source, struct mbuf *m, u_int *cpuidp)
 {
 	struct ifnet *ifp;
 	u_int policy;
 
 	NETISR_LOCK_ASSERT();
 
 	/*
 	 * In the event we have only one worker, shortcut and deliver to it
 	 * without further ado.
 	 */
 	if (nws_count == 1) {
 		*cpuidp = nws_array[0];
 		return (m);
 	}
 
 	/*
 	 * What happens next depends on the policy selected by the protocol.
 	 * If we want to support per-interface policies, we should do that
 	 * here first.
 	 */
 	policy = npp->np_policy;
 	if (policy == NETISR_POLICY_CPU) {
 		m = npp->np_m2cpuid(m, source, cpuidp);
 		if (m == NULL)
 			return (NULL);
 
 		/*
 		 * It's possible for a protocol not to have a good idea about
 		 * where to process a packet, in which case we fall back on
 		 * the netisr code to decide.  In the hybrid case, return the
 		 * current CPU ID, which will force an immediate direct
 		 * dispatch.  In the queued case, fall back on the SOURCE
 		 * policy.
 		 */
 		if (*cpuidp != NETISR_CPUID_NONE)
 			return (m);
 		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
 			*cpuidp = curcpu;
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	if (policy == NETISR_POLICY_FLOW) {
 		if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE &&
 		    npp->np_m2flow != NULL) {
 			m = npp->np_m2flow(m, source);
 			if (m == NULL)
 				return (NULL);
 		}
 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 			*cpuidp =
 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	KASSERT(policy == NETISR_POLICY_SOURCE,
 	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
 	    npp->np_name));
 
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL)
 		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
 	else
 		*cpuidp = nws_array[source % nws_count];
 	return (m);
 }
 
 /*
  * Process packets associated with a workstream and protocol.  For reasons of
  * fairness, we process up to one complete netisr queue at a time, moving the
  * queue to a stack-local queue for processing, but do not loop refreshing
  * from the global queue.  The caller is responsible for deciding whether to
  * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
  * locked on entry and relocked before return, but will be released while
  * processing.  The number of packets processed is returned.
  */
 static u_int
 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
 {
 	struct netisr_work local_npw, *npwp;
 	u_int handled;
 	struct mbuf *m;
+#ifdef INVARIANTS
+	u_int b, f;
+#endif
 
 	NETISR_LOCK_ASSERT();
 	NWS_LOCK_ASSERT(nwsp);
 
 	KASSERT(nwsp->nws_flags & NWS_RUNNING,
 	    ("%s(%u): not running", __func__, proto));
 	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
 	    ("%s(%u): invalid proto\n", __func__, proto));
 
 	npwp = &nwsp->nws_work[proto];
 	if (npwp->nw_len == 0)
 		return (0);
 
+#ifdef INVARIANTS
+	b = npwp->nw_len;
+	f = 0;
+#endif
 	/*
 	 * Move the global work queue to a thread-local work queue.
 	 *
 	 * Notice that this means the effective maximum length of the queue
 	 * is actually twice that of the maximum queue length specified in
 	 * the protocol registration call.
 	 */
 	handled = npwp->nw_len;
 	local_npw = *npwp;
 	npwp->nw_head = NULL;
 	npwp->nw_tail = NULL;
 	npwp->nw_len = 0;
 	nwsp->nws_pendingbits &= ~(1 << proto);
 	NWS_UNLOCK(nwsp);
 	while ((m = local_npw.nw_head) != NULL) {
 		local_npw.nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (local_npw.nw_head == NULL)
 			local_npw.nw_tail = NULL;
 		local_npw.nw_len--;
+		f++;
 		VNET_ASSERT(m->m_pkthdr.rcvif != NULL,
 		    ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m));
 		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
-		V_netisr_proto[proto].np_handler(m);
+		netisr_proto[proto].np_handler(m);
 		CURVNET_RESTORE();
 	}
 	KASSERT(local_npw.nw_len == 0,
-	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
-	/* We can just use the one from the default VNET. */
-	CURVNET_SET_QUIET(vnet0);
-	if (V_netisr_proto[proto].np_drainedcpu)
-		V_netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
-	CURVNET_RESTORE();
+	    ("%s(%u): len %u b %u f %u", __func__, proto, local_npw.nw_len,
+	    b, f));
+	if (netisr_proto[proto].np_drainedcpu)
+		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
 	NWS_LOCK(nwsp);
 	npwp->nw_handled += handled;
 	return (handled);
 }
 
 /*
  * SWI handler for netisr -- processes packets in a set of workstreams that
  * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
  * being direct dispatched, go back to sleep and wait for the dispatching
  * thread to wake us up again.
  */
 static void
 swi_net(void *arg)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	u_int bits, prot;
 
 	nwsp = arg;
 
 #ifdef DEVICE_POLLING
 	KASSERT(nws_count == 1,
 	    ("%s: device_polling but nws_count != 1", __func__));
 	netisr_poll();
 #endif
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	NWS_LOCK(nwsp);
 	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
 	if (nwsp->nws_flags & NWS_DISPATCHING)
 		goto out;
 	nwsp->nws_flags |= NWS_RUNNING;
 	nwsp->nws_flags &= ~NWS_SCHEDULED;
 	while ((bits = nwsp->nws_pendingbits) != 0) {
 		while ((prot = ffs(bits)) != 0) {
 			prot--;
 			bits &= ~(1 << prot);
 			(void)netisr_process_workstream_proto(nwsp, prot);
 		}
 	}
 	nwsp->nws_flags &= ~NWS_RUNNING;
 out:
 	NWS_UNLOCK(nwsp);
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 #ifdef DEVICE_POLLING
 	netisr_pollmore();
 #endif
 }
 
 static int
 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
     struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
 {
 
 	NWS_LOCK_ASSERT(nwsp);
 
 	*dosignalp = 0;
 	if (npwp->nw_len < npwp->nw_qlimit) {
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL) {
 			npwp->nw_head = m;
 			npwp->nw_tail = m;
 		} else {
 			npwp->nw_tail->m_nextpkt = m;
 			npwp->nw_tail = m;
 		}
 		npwp->nw_len++;
 		if (npwp->nw_len > npwp->nw_watermark)
 			npwp->nw_watermark = npwp->nw_len;
 
 		/*
 		 * We must set the bit regardless of NWS_RUNNING, so that
 		 * swi_net() keeps calling netisr_process_workstream_proto().
 		 */
 		nwsp->nws_pendingbits |= (1 << proto);
 		if (!(nwsp->nws_flags & 
 		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
 			nwsp->nws_flags |= NWS_SCHEDULED;
 			*dosignalp = 1;	/* Defer until unlocked. */
 		}
 		npwp->nw_queued++;
 		return (0);
 	} else {
 		m_freem(m);
 		npwp->nw_qdrops++;
 		return (ENOBUFS);
 	}
 }
 
 static int
 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 
 #ifdef NETISR_LOCKING
 	NETISR_LOCK_ASSERT();
 #endif
 	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
 	    cpuid, mp_maxid));
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	dosignal = 0;
 	error = 0;
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	npwp = &nwsp->nws_work[proto];
 	NWS_LOCK(nwsp);
 	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	return (error);
 }
 
 int
 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	u_int cpuid;
 	int error;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
-	if (V_netisr_proto[proto].np_handler == NULL) {
+	KASSERT(netisr_proto[proto].np_handler != NULL,
+	    ("%s: invalid proto %u", __func__, proto));
+
+	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		return (ENOPROTOOPT);
 	}
 
-	m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
+	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
 	    source, m, &cpuid);
 	if (m != NULL) {
 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
 		    cpuid));
 		error = netisr_queue_internal(proto, m, cpuid);
 	} else
 		error = ENOBUFS;
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_queue(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_queue_src(proto, 0, m));
 }
 
 /*
  * Dispatch a packet for netisr processing; direct dispatch is permitted by
  * calling context.
  */
 int
 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 	u_int cpuid, dispatch_policy;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
-	npp = &V_netisr_proto[proto];
-	if (npp->np_handler == NULL) {
+	npp = &netisr_proto[proto];
+	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
+	    proto));
+
+	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		return (ENOPROTOOPT);
 	}
 
 	dispatch_policy = netisr_get_dispatch(npp);
 	if (dispatch_policy == NETISR_DISPATCH_DEFERRED)
 		return (netisr_queue_src(proto, source, m));
 
 	/*
 	 * If direct dispatch is forced, then unconditionally dispatch
 	 * without a formal CPU selection.  Borrow the current CPU's stats,
 	 * even if there's no worker on it.  In this case we don't update
 	 * nws_flags because all netisr processing will be source ordered due
 	 * to always being forced to directly dispatch.
 	 */
 	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
 		nwsp = DPCPU_PTR(nws);
 		npwp = &nwsp->nws_work[proto];
 		npwp->nw_dispatched++;
 		npwp->nw_handled++;
-		V_netisr_proto[proto].np_handler(m);
+		netisr_proto[proto].np_handler(m);
 		error = 0;
 		goto out_unlock;
 	}
 
 	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
 	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));
 
 	/*
 	 * Otherwise, we execute in a hybrid mode where we will try to direct
 	 * dispatch if we're on the right CPU and the netisr worker isn't
 	 * already running.
 	 */
 	sched_pin();
-	m = netisr_select_cpuid(&V_netisr_proto[proto], NETISR_DISPATCH_HYBRID,
+	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
 	    source, m, &cpuid);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto out_unpin;
 	}
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 	if (cpuid != curcpu)
 		goto queue_fallback;
 	nwsp = DPCPU_PTR(nws);
 	npwp = &nwsp->nws_work[proto];
 
 	/*-
 	 * We are willing to direct dispatch only if three conditions hold:
 	 *
 	 * (1) The netisr worker isn't already running,
 	 * (2) Another thread isn't already directly dispatching, and
 	 * (3) The netisr hasn't already been woken up.
 	 */
 	NWS_LOCK(nwsp);
 	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
 		error = netisr_queue_workstream(nwsp, proto, npwp, m,
 		    &dosignal);
 		NWS_UNLOCK(nwsp);
 		if (dosignal)
 			NWS_SIGNAL(nwsp);
 		goto out_unpin;
 	}
 
 	/*
 	 * The current thread is now effectively the netisr worker, so set
 	 * the dispatching flag to prevent concurrent processing of the
 	 * stream from another thread (even the netisr worker), which could
 	 * otherwise lead to effective misordering of the stream.
 	 */
 	nwsp->nws_flags |= NWS_DISPATCHING;
 	NWS_UNLOCK(nwsp);
-	V_netisr_proto[proto].np_handler(m);
+	netisr_proto[proto].np_handler(m);
 	NWS_LOCK(nwsp);
 	nwsp->nws_flags &= ~NWS_DISPATCHING;
 	npwp->nw_handled++;
 	npwp->nw_hybrid_dispatched++;
 
 	/*
 	 * If other work was enqueued by another thread while we were direct
 	 * dispatching, we need to signal the netisr worker to do that work.
 	 * In the future, we might want to do some of that work in the
 	 * current thread, rather than trigger further context switches.  If
 	 * so, we'll want to establish a reasonable bound on the work done in
 	 * the "borrowed" context.
 	 */
 	if (nwsp->nws_pendingbits != 0) {
 		nwsp->nws_flags |= NWS_SCHEDULED;
 		dosignal = 1;
 	} else
 		dosignal = 0;
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	error = 0;
 	goto out_unpin;
 
 queue_fallback:
 	error = netisr_queue_internal(proto, m, cpuid);
 out_unpin:
 	sched_unpin();
 out_unlock:
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_dispatch(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_dispatch_src(proto, 0, m));
 }
 
 #ifdef DEVICE_POLLING
 /*
  * Kernel polling borrows a netisr thread to run interface polling in; this
  * function allows kernel polling to request that the netisr thread be
  * scheduled even if no packets are pending for protocols.
  */
 void
 netisr_sched_poll(void)
 {
 	struct netisr_workstream *nwsp;
 
 	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
 	NWS_SIGNAL(nwsp);
 }
 #endif
 
 static void
 netisr_start_swi(u_int cpuid, struct pcpu *pc)
 {
 	char swiname[12];
 	struct netisr_workstream *nwsp;
 	int error;
 
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
 	nwsp->nws_cpu = cpuid;
 	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
 	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
 	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
 	if (error)
 		panic("%s: swi_add %d", __func__, error);
 	pc->pc_netisr = nwsp->nws_intr_event;
 	if (netisr_bindthreads) {
 		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
 		if (error != 0)
 			printf("%s: cpu %u: intr_event_bind: %d", __func__,
 			    cpuid, error);
 	}
 	NETISR_WLOCK();
 	nws_array[nws_count] = nwsp->nws_cpu;
 	nws_count++;
 	NETISR_WUNLOCK();
 }
 
 /*
  * Initialize the netisr subsystem.  We rely on BSS and static initialization
  * of most fields in global data structures.
  *
  * Start a worker thread for the boot CPU so that we can support network
  * traffic immediately in case the network stack is used before additional
  * CPUs are started (for example, diskless boot).
  */
 static void
 netisr_init(void *arg)
 {
 	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));
 
 	NETISR_LOCK_INIT();
 	if (netisr_maxthreads == 0 || netisr_maxthreads < -1 )
 		netisr_maxthreads = 1;		/* default behavior */
 	else if (netisr_maxthreads == -1)
 		netisr_maxthreads = mp_ncpus;	/* use max cpus */
 	if (netisr_maxthreads > mp_ncpus) {
 		printf("netisr_init: forcing maxthreads from %d to %d\n",
 		    netisr_maxthreads, mp_ncpus);
 		netisr_maxthreads = mp_ncpus;
 	}
 	if (netisr_defaultqlimit > netisr_maxqlimit) {
 		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
 		    netisr_defaultqlimit, netisr_maxqlimit);
 		netisr_defaultqlimit = netisr_maxqlimit;
 	}
 #ifdef DEVICE_POLLING
 	/*
 	 * The device polling code is not yet aware of how to deal with
 	 * multiple netisr threads, so for the time being compiling in device
 	 * polling disables parallel netisr workers.
 	 */
 	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
 		printf("netisr_init: forcing maxthreads to 1 and "
 		    "bindthreads to 0 for device polling\n");
 		netisr_maxthreads = 1;
 		netisr_bindthreads = 0;
 	}
 #endif
 	netisr_start_swi(curcpu, pcpu_find(curcpu));
 }
 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
 
 /*
  * Start worker threads for additional CPUs.  No attempt to gracefully handle
  * work reassignment, we don't yet support dynamic reconfiguration.
  */
 static void
 netisr_start(void *arg)
 {
 	struct pcpu *pc;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		/* Worker will already be present for boot CPU. */
 		if (pc->pc_netisr != NULL)
 			continue;
 		netisr_start_swi(pc->pc_cpuid, pc);
 	}
 }
 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
 
 /*
  * Sysctl monitoring for netisr: query a list of registered protocols.
  */
 static int
 sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_proto *snpp, *snp_array;
 	struct netisr_proto *npp;
 	u_int counter, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-		npp = &V_netisr_proto[proto];
+		npp = &netisr_proto[proto];
 		if (npp->np_name == NULL)
 			continue;
 		snpp = &snp_array[counter];
 		snpp->snp_version = sizeof(*snpp);
 		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
 		snpp->snp_proto = proto;
 		snpp->snp_qlimit = npp->np_qlimit;
 		snpp->snp_policy = npp->np_policy;
 		snpp->snp_dispatch = npp->np_dispatch;
 		if (npp->np_m2flow != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
 		if (npp->np_m2cpuid != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
 		if (npp->np_drainedcpu != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= NETISR_MAXPROT,
 	    ("sysctl_netisr_proto: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
 	free(snp_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, proto,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
     "S,sysctl_netisr_proto",
     "Return list of protocols registered with netisr");
 
 /*
  * Sysctl monitoring for netisr: query a list of workstreams.
  */
 static int
 sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_workstream *snwsp, *snws_array;
 	struct netisr_workstream *nwsp;
 	u_int counter, cpuid;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		snwsp = &snws_array[counter];
 		snwsp->snws_version = sizeof(*snwsp);
 
 		/*
 		 * For now, we equate workstream IDs and CPU IDs in the
 		 * kernel, but expose them independently to userspace in case
 		 * that assumption changes in the future.
 		 */
 		snwsp->snws_wsid = cpuid;
 		snwsp->snws_cpu = cpuid;
 		if (nwsp->nws_intr_event != NULL)
 			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
 		NWS_UNLOCK(nwsp);
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= MAXCPU,
 	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
 	free(snws_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
     "S,sysctl_netisr_workstream",
     "Return list of workstreams implemented by netisr");
 
 /*
  * Sysctl monitoring for netisr: query per-protocol data across all
  * workstreams.
  */
 static int
 sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_work *snwp, *snw_array;
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *nwp;
 	u_int counter, cpuid, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
 	    M_TEMP, M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-			npp = &V_netisr_proto[proto];
+			npp = &netisr_proto[proto];
 			if (npp->np_name == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			snwp = &snw_array[counter];
 			snwp->snw_version = sizeof(*snwp);
 			snwp->snw_wsid = cpuid;		/* See comment above. */
 			snwp->snw_proto = proto;
 			snwp->snw_len = nwp->nw_len;
 			snwp->snw_watermark = nwp->nw_watermark;
 			snwp->snw_dispatched = nwp->nw_dispatched;
 			snwp->snw_hybrid_dispatched =
 			    nwp->nw_hybrid_dispatched;
 			snwp->snw_qdrops = nwp->nw_qdrops;
 			snwp->snw_queued = nwp->nw_queued;
 			snwp->snw_handled = nwp->nw_handled;
 			counter++;
 		}
 		NWS_UNLOCK(nwsp);
 	}
 	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
 	    ("sysctl_netisr_work: counter too big (%d)", counter));
 	NETISR_RUNLOCK(&tracker);
 	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
 	free(snw_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, work,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
     "S,sysctl_netisr_work",
     "Return list of per-workstream, per-protocol work in netisr");
 
 #ifdef DDB
 DB_SHOW_COMMAND(netisr, db_show_netisr)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *nwp;
 	int first, proto;
 	u_int cpuid;
 
 	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
 	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		first = 1;
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
-			if (V_netisr_proto[proto].np_handler == NULL)
+			if (netisr_proto[proto].np_handler == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			if (first) {
 				db_printf("%3d ", cpuid);
 				first = 0;
 			} else
 				db_printf("%3s ", "");
 			db_printf(
 			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
-			    V_netisr_proto[proto].np_name, nwp->nw_len,
+			    netisr_proto[proto].np_name, nwp->nw_len,
 			    nwp->nw_watermark, nwp->nw_qlimit,
 			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
 			    nwp->nw_qdrops, nwp->nw_queued);
 		}
 	}
 }
 #endif
Index: projects/vnet/sys/net/netisr.h
===================================================================
--- projects/vnet/sys/net/netisr.h	(revision 299172)
+++ projects/vnet/sys/net/netisr.h	(revision 299173)
@@ -1,245 +1,249 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_NETISR_H_
 #define _NET_NETISR_H_
 
 /*
  * The netisr (network interrupt service routine) provides a deferred
  * execution evironment in which (generally inbound) network processing can
  * take place.  Protocols register handlers which will be executed directly,
  * or via deferred dispatch, depending on the circumstances.
  *
  * Historically, this was implemented by the BSD software ISR facility; it is
  * now implemented via a software ithread (SWI).
  */
 
 /*
  * Protocol numbers, which are encoded in monitoring applications and kernel
  * modules.  Internally, these are used in bit shift operations so must have
  * a value 0 < proto < 32; we currently further limit at compile-time to 16
  * for array-sizing purposes.
  */
 #define	NETISR_IP	1
 #define	NETISR_IGMP	2		/* IGMPv3 output queue */
 #define	NETISR_ROUTE	3		/* routing socket */
 #define	NETISR_ARP	4		/* same as AF_LINK */
 #define	NETISR_ETHER	5		/* ethernet input */
 #define	NETISR_IPV6	6
 #define	NETISR_NATM	7
 #define	NETISR_EPAIR	8		/* if_epair(4) */
 #define	NETISR_IP_DIRECT	9	/* direct-dispatch IPv4 */
 #define	NETISR_IPV6_DIRECT	10	/* direct-dispatch IPv6 */
 
 /*
  * Protocol ordering and affinity policy constants.  See the detailed
  * discussion of policies later in the file.
  */
 #define	NETISR_POLICY_SOURCE	1	/* Maintain source ordering. */
 #define	NETISR_POLICY_FLOW	2	/* Maintain flow ordering. */
 #define	NETISR_POLICY_CPU	3	/* Protocol determines CPU placement. */
 
 /*
  * Protocol dispatch policy constants; selects whether and when direct
  * dispatch is permitted.
  */
 #define	NETISR_DISPATCH_DEFAULT		0	/* Use global default. */
 #define	NETISR_DISPATCH_DEFERRED	1	/* Always defer dispatch. */
 #define	NETISR_DISPATCH_HYBRID		2	/* Allow hybrid dispatch. */
 #define	NETISR_DISPATCH_DIRECT		3	/* Always direct dispatch. */
 
 /*
  * Monitoring data structures, exported by sysctl(2).
  *
  * Three sysctls are defined.  First, a per-protocol structure exported by
  * net.isr.proto.
  */
 #define	NETISR_NAMEMAXLEN	32
 struct sysctl_netisr_proto {
 	u_int	snp_version;			/* Length of struct. */
 	char	snp_name[NETISR_NAMEMAXLEN];	/* nh_name */
 	u_int	snp_proto;			/* nh_proto */
 	u_int	snp_qlimit;			/* nh_qlimit */
 	u_int	snp_policy;			/* nh_policy */
 	u_int	snp_flags;			/* Various flags. */
 	u_int	snp_dispatch;			/* Dispatch policy. */
 	u_int	_snp_ispare[6];
 };
 
 /*
  * Flags for sysctl_netisr_proto.snp_flags.
  */
 #define	NETISR_SNP_FLAGS_M2FLOW		0x00000001	/* nh_m2flow */
 #define	NETISR_SNP_FLAGS_M2CPUID	0x00000002	/* nh_m2cpuid */
 #define	NETISR_SNP_FLAGS_DRAINEDCPU	0x00000004	/* nh_drainedcpu */
 
 /*
  * Next, a structure per-workstream, with per-protocol data, exported as
  * net.isr.workstream.
  */
 struct sysctl_netisr_workstream {
 	u_int	snws_version;			/* Length of struct. */
 	u_int	snws_flags;			/* Various flags. */
 	u_int	snws_wsid;			/* Workstream ID. */
 	u_int	snws_cpu;			/* nws_cpu */
 	u_int	_snws_ispare[12];
 };
 
 /*
  * Flags for sysctl_netisr_workstream.snws_flags
  */
 #define	NETISR_SNWS_FLAGS_INTR		0x00000001	/* nws_intr_event */
 
 /*
  * Finally, a per-workstream-per-protocol structure, exported as
  * net.isr.work.
  */
 struct sysctl_netisr_work {
 	u_int	snw_version;			/* Length of struct. */
 	u_int	snw_wsid;			/* Workstream ID. */
 	u_int	snw_proto;			/* Protocol number. */
 	u_int	snw_len;			/* nw_len */
 	u_int	snw_watermark;			/* nw_watermark */
 	u_int	_snw_ispare[3];
 
 	uint64_t	snw_dispatched;		/* nw_dispatched */
 	uint64_t	snw_hybrid_dispatched;	/* nw_hybrid_dispatched */
 	uint64_t	snw_qdrops;		/* nw_qdrops */
 	uint64_t	snw_queued;		/* nw_queued */
 	uint64_t	snw_handled;		/* nw_handled */
 
 	uint64_t	_snw_llspare[7];
 };
 
 #ifdef _KERNEL
 
 /*-
  * Protocols express ordering constraints and affinity preferences by
  * implementing one or neither of nh_m2flow and nh_m2cpuid, which are used by
  * netisr to determine which per-CPU workstream to assign mbufs to.
  *
  * The following policies may be used by protocols:
  *
  * NETISR_POLICY_SOURCE - netisr should maintain source ordering without
  *                        advice from the protocol.  netisr will ignore any
  *                        flow IDs present on the mbuf for the purposes of
  *                        work placement.
  *
  * NETISR_POLICY_FLOW - netisr should maintain flow ordering as defined by
  *                      the mbuf header flow ID field.  If the protocol
  *                      implements nh_m2flow, then netisr will query the
  *                      protocol in the event that the mbuf doesn't have a
  *                      flow ID, falling back on source ordering.
  *
  * NETISR_POLICY_CPU - netisr will delegate all work placement decisions to
  *                     the protocol, querying nh_m2cpuid for each packet.
  *
  * Protocols might make decisions about work placement based on an existing
  * calculated flow ID on the mbuf, such as one provided in hardware, the
  * receive interface pointed to by the mbuf (if any), the optional source
  * identifier passed at some dispatch points, or even parse packet headers to
  * calculate a flow.  Both protocol handlers may return a new mbuf pointer
  * for the chain, or NULL if the packet proves invalid or m_pullup() fails.
  *
  * XXXRW: If we eventually support dynamic reconfiguration, there should be
  * protocol handlers to notify them of CPU configuration changes so that they
  * can rebalance work.
  */
 struct mbuf;
 typedef void		 netisr_handler_t(struct mbuf *m);
 typedef struct mbuf	*netisr_m2cpuid_t(struct mbuf *m, uintptr_t source,
 			 u_int *cpuid);
 typedef	struct mbuf	*netisr_m2flow_t(struct mbuf *m, uintptr_t source);
 typedef void		 netisr_drainedcpu_t(u_int cpuid);
 
 #define	NETISR_CPUID_NONE	((u_int)-1)	/* No affinity returned. */
 
 /*
  * Data structure describing a protocol handler.
  */
 struct netisr_handler {
 	const char	*nh_name;	/* Character string protocol name. */
 	netisr_handler_t *nh_handler;	/* Protocol handler. */
 	netisr_m2flow_t	*nh_m2flow;	/* Query flow for untagged packet. */
 	netisr_m2cpuid_t *nh_m2cpuid;	/* Query CPU to process mbuf on. */
 	netisr_drainedcpu_t *nh_drainedcpu; /* Callback when drained a queue. */
 	u_int		 nh_proto;	/* Integer protocol ID. */
 	u_int		 nh_qlimit;	/* Maximum per-CPU queue depth. */
 	u_int		 nh_policy;	/* Work placement policy. */
 	u_int		 nh_dispatch;	/* Dispatch policy. */
 	u_int		 nh_ispare[4];	/* For future use. */
 	void		*nh_pspare[4];	/* For future use. */
 };
 
 /*
  * Register, unregister, and other netisr handler management functions.
  */
 void	netisr_clearqdrops(const struct netisr_handler *nhp);
 void	netisr_getqdrops(const struct netisr_handler *nhp,
 	    u_int64_t *qdropsp);
 void	netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp);
 void	netisr_register(const struct netisr_handler *nhp);
 int	netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit);
 void	netisr_unregister(const struct netisr_handler *nhp);
+#ifdef VIMAGE
+void	netisr_register_vnet(const struct netisr_handler *nhp);
+void	netisr_unregister_vnet(const struct netisr_handler *nhp);
+#endif
 
 /*
  * Process a packet destined for a protocol, and attempt direct dispatch.
  * Supplemental source ordering information can be passed using the _src
  * variant.
  */
 int	netisr_dispatch(u_int proto, struct mbuf *m);
 int	netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m);
 int	netisr_queue(u_int proto, struct mbuf *m);
 int	netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m);
 
 /*
  * Provide a default implementation of "map an ID to a CPU ID".
  */
 u_int	netisr_default_flow2cpu(u_int flowid);
 
 /*
  * Utility routines to return the number of CPUs participting in netisr, and
  * to return a mapping from a number to a CPU ID that can be used with the
  * scheduler.
  */
 u_int	netisr_get_cpucount(void);
 u_int	netisr_get_cpuid(u_int cpunumber);
 
 /*
  * Interfaces between DEVICE_POLLING and netisr.
  */
 void	netisr_sched_poll(void);
 void	netisr_poll(void);
 void	netisr_pollmore(void);
 
 #endif /* !_KERNEL */
 #endif /* !_NET_NETISR_H_ */
Index: projects/vnet/sys/net/rtsock.c
===================================================================
--- projects/vnet/sys/net/rtsock.c	(revision 299172)
+++ projects/vnet/sys/net/rtsock.c	(revision 299173)
@@ -1,1942 +1,1938 @@
 /*-
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_compat.h"
 #include "opt_mpath.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/raw_cb.h>
 #include <net/route.h>
 #include <net/route_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	struct	if_data ifm_data;
 };
 
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	struct	if_data ifm_data;
 };
 
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 #endif /* COMPAT_FREEBSD32 */
 
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /* NB: these are not modified */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock/raw_input callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 
 typedef struct {
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 static VNET_DEFINE(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
 
 struct walkarg {
 	int	w_tmemsize;
 	int	w_op, w_arg;
 	caddr_t	w_tmem;
 	struct sysctl_req *w_req;
 };
 
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	sysctl_dumpentry(struct radix_node *rn, void *vw);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static int	route_output(struct mbuf *m, struct socket *so, ...);
 static void	rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static struct sockaddr	*rtsock_fix_netmask(struct sockaddr *dst,
 			struct sockaddr *smask, struct sockaddr_storage *dmask);
 
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
         if (error || !req->newptr)
                 return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 static void
-rts_init(void)
+vnet_rts_init(void)
 {
 	int tmp;
 
-	if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
-		rtsock_nh.nh_qlimit = tmp;
+	if (IS_DEFAULT_VNET(curvnet)) {
+		if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
+			rtsock_nh.nh_qlimit = tmp;
+		netisr_register(&rtsock_nh);
+	} else
+		netisr_register_vnet(&rtsock_nh);
 }
-SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
-
-static void
-vnet_rts_init(void)
-{
-
-	netisr_register(&rtsock_nh);
-}
 VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_init, 0);
 
 static void
 vnet_rts_uninit(void)
 {
 
-	netisr_unregister(&rtsock_nh);
+	netisr_unregister_vnet(&rtsock_nh);
 }
 VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_uninit, 0);
 
 static int
 raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src,
     struct rawcb *rp)
 {
 	int fibnum;
 
 	KASSERT(m != NULL, ("%s: m is NULL", __func__));
 	KASSERT(proto != NULL, ("%s: proto is NULL", __func__));
 	KASSERT(rp != NULL, ("%s: rp is NULL", __func__));
 
 	/* No filtering requested. */
 	if ((m->m_flags & RTS_FILTER_FIB) == 0)
 		return (0);
 
 	/* Check if it is a rts and the fib matches the one of the socket. */
 	fibnum = M_GETFIB(m);
 	if (proto->sp_family != PF_ROUTE ||
 	    rp->rcb_socket == NULL ||
 	    rp->rcb_socket->so_fibnum == fibnum)
 		return (0);
 
 	/* Filtering requested and no match, the socket shall be skipped. */
 	return (1);
 }
 
 static void
 rts_input(struct mbuf *m)
 {
 	struct sockproto route_proto;
 	unsigned short *family;
 	struct m_tag *tag;
 
 	route_proto.sp_family = PF_ROUTE;
 	tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
 	if (tag != NULL) {
 		family = (unsigned short *)(tag + 1);
 		route_proto.sp_protocol = *family;
 		m_tag_delete(m, tag);
 	} else
 		route_proto.sp_protocol = 0;
 
 	raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb);
 }
 
 /*
  * It really doesn't make any sense at all for this code to share much
  * with raw_usrreq.c, since its functionality is so restricted.  XXX
  */
 static void
 rts_abort(struct socket *so)
 {
 
 	raw_usrreqs.pru_abort(so);
 }
 
 static void
 rts_close(struct socket *so)
 {
 
 	raw_usrreqs.pru_close(so);
 }
 
 /* pru_accept is EOPNOTSUPP */
 
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rawcb *rp;
 	int error;
 
 	KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
 
 	/* XXX */
 	rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
 
 	so->so_pcb = (caddr_t)rp;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	error = raw_attach(so, proto);
 	rp = sotorawcb(so);
 	if (error) {
 		so->so_pcb = NULL;
 		free(rp, M_PCB);
 		return error;
 	}
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 	so->so_options |= SO_USELOOPBACK;
 	return 0;
 }
 
 static int
 rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
 }
 
 static int
 rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
 }
 
 /* pru_connect2 is EOPNOTSUPP */
 /* pru_control is EOPNOTSUPP */
 
 static void
 rts_detach(struct socket *so)
 {
 	struct rawcb *rp = sotorawcb(so);
 
 	KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
 
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count--;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count--;
 		break;
 	}
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 	raw_usrreqs.pru_detach(so);
 }
 
 static int
 rts_disconnect(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_disconnect(so));
 }
 
 /* pru_listen is EOPNOTSUPP */
 
 static int
 rts_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_peeraddr(so, nam));
 }
 
 /* pru_rcvd is EOPNOTSUPP */
 /* pru_rcvoob is EOPNOTSUPP */
 
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
 }
 
 /* pru_sense is null */
 
 static int
 rts_shutdown(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_shutdown(so));
 }
 
 static int
 rts_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_sockaddr(so, nam));
 }
 
 static struct pr_usrreqs route_usrreqs = {
 	.pru_abort =		rts_abort,
 	.pru_attach =		rts_attach,
 	.pru_bind =		rts_bind,
 	.pru_connect =		rts_connect,
 	.pru_detach =		rts_detach,
 	.pru_disconnect =	rts_disconnect,
 	.pru_peeraddr =		rts_peeraddr,
 	.pru_send =		rts_send,
 	.pru_shutdown =		rts_shutdown,
 	.pru_sockaddr =		rts_sockaddr,
 	.pru_close =		rts_close,
 };
 
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle.
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
 {
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct in6_addr ia6;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET6)
 				continue;
 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
 			    &ia6, sizeof(struct in6_addr));
 			if (prison_check_ip6(cred, &ia6) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
 			    sin6_addr;
 			if (prison_get_ip6(cred, &ia6) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
 		if (sa6_recoverscope(&saun->sin6) != 0)
 			return (ESRCH);
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 route_output(struct mbuf *m, struct socket *so, ...)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rtentry *rt = NULL;
 	struct rib_head *rnh;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	int i, rti_need_deembed = 0;
 #endif
 	int alloc_len = 0, len, error = 0, fibnum;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
 	sa_family_t saf = AF_UNSPEC;
 	struct rawcb *rp = NULL;
 	struct walkarg w;
 
 	fibnum = so->so_fibnum;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundaty.
 	 */
 	alloc_len = roundup2(len, 1024);
 	if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	bzero(&w, sizeof(w));
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	rtm->rtm_pid = curproc->p_pid;
 	info.rti_addrs = rtm->rtm_addrs;
 
 	info.rti_mflags = rtm->rtm_inits;
 	info.rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info))
 		senderr(EINVAL);
 
 	info.rti_flags = rtm->rtm_flags;
 	if (info.rti_info[RTAX_DST] == NULL ||
 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
 	     info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
 		senderr(EINVAL);
 	saf = info.rti_info[RTAX_DST]->sa_family;
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error)
 			senderr(error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the 
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
 	    info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct rt_addrinfo ginfo;
 		struct sockaddr *gdst;
 
 		bzero(&ginfo, sizeof(ginfo));
 		bzero(&ss, sizeof(ss));
 		ss.ss_len = sizeof(ss);
 
 		ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss;
 		gdst = info.rti_info[RTAX_GATEWAY];
 
 		/* 
 		 * A host route through the loopback interface is 
 		 * installed for each interface adddress. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as 
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the rt_gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) {
 			if (ss.ss_family == AF_LINK &&
 			    ginfo.rti_ifp->if_flags & IFF_LOOPBACK) {
 				info.rti_flags &= ~RTF_GATEWAY;
 				info.rti_flags |= RTF_GWFLAG_COMPAT;
 			}
 			rib_free_info(&ginfo);
 		}
 	}
 
 	switch (rtm->rtm_type) {
 		struct rtentry *saved_nrt;
 
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (info.rti_info[RTAX_GATEWAY] == NULL)
 			senderr(EINVAL);
 		saved_nrt = NULL;
 
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt,
 		    fibnum);
 		if (error == 0 && saved_nrt != NULL) {
 #ifdef INET6
 			rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			RT_LOCK(saved_nrt);
 			rtm->rtm_index = saved_nrt->rt_ifp->if_index;
 			RT_REMREF(saved_nrt);
 			RT_UNLOCK(saved_nrt);
 		}
 		break;
 
 	case RTM_DELETE:
 		saved_nrt = NULL;
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY] && 
 		    (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum);
 		if (error == 0) {
 			RT_LOCK(saved_nrt);
 			rt = saved_nrt;
 			goto report;
 		}
 #ifdef INET6
 		/* rt_msg2() will not be used when RTM_DELETE fails. */
 		rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 		break;
 
 	case RTM_GET:
 		rnh = rt_tables_get_rnh(fibnum, saf);
 		if (rnh == NULL)
 			senderr(EAFNOSUPPORT);
 
 		RIB_RLOCK(rnh);
 
 		if (info.rti_info[RTAX_NETMASK] == NULL &&
 		    rtm->rtm_type == RTM_GET) {
 			/*
 			 * Provide logest prefix match for
 			 * address lookup (no mask).
 			 * 'route -n get addr'
 			 */
 			rt = (struct rtentry *) rnh->rnh_matchaddr(
 			    info.rti_info[RTAX_DST], &rnh->head);
 		} else
 			rt = (struct rtentry *) rnh->rnh_lookup(
 			    info.rti_info[RTAX_DST],
 			    info.rti_info[RTAX_NETMASK], &rnh->head);
 
 		if (rt == NULL) {
 			RIB_RUNLOCK(rnh);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
 		/*
 		 * for RTM_CHANGE/LOCK, if we got multipath routes,
 		 * we require users to specify a matching RTAX_GATEWAY.
 		 *
 		 * for RTM_GET, gate is optional even with multipath.
 		 * if gate == NULL the first match is returned.
 		 * (no need to call rt_mpath_matchgate if gate == NULL)
 		 */
 		if (rt_mpath_capable(rnh) &&
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
 				RIB_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		}
 #endif
 		/*
 		 * If performing proxied L2 entry insertion, and
 		 * the actual PPP host entry is found, perform
 		 * another search to retrieve the prefix route of
 		 * the local end point of the PPP link.
 		 */
 		if (rtm->rtm_flags & RTF_ANNOUNCE) {
 			struct sockaddr laddr;
 
 			if (rt->rt_ifp != NULL && 
 			    rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
 				struct ifaddr *ifa;
 
 				ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1,
 						RT_ALL_FIBS);
 				if (ifa != NULL)
 					rt_maskedcopy(ifa->ifa_addr,
 						      &laddr,
 						      ifa->ifa_netmask);
 			} else
 				rt_maskedcopy(rt->rt_ifa->ifa_addr,
 					      &laddr,
 					      rt->rt_ifa->ifa_netmask);
 			/* 
 			 * refactor rt and no lock operation necessary
 			 */
 			rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr,
 			    &rnh->head);
 			if (rt == NULL) {
 				RIB_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		RIB_RUNLOCK(rnh);
 
 report:
 		RT_LOCK_ASSERT(rt);
 		if ((rt->rt_flags & RTF_HOST) == 0
 		    ? jailed_without_vnet(curthread->td_ucred)
 		    : prison_if(curthread->td_ucred,
 		    rt_key(rt)) != 0) {
 			RT_UNLOCK(rt);
 			senderr(ESRCH);
 		}
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 		    rt_mask(rt), &ss);
 		info.rti_info[RTAX_GENMASK] = 0;
 		if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 			ifp = rt->rt_ifp;
 			if (ifp) {
 				info.rti_info[RTAX_IFP] =
 				    ifp->if_addr->ifa_addr;
 				error = rtm_get_jailed(&info, ifp, rt,
 				    &saun, curthread->td_ucred);
 				if (error != 0) {
 					RT_UNLOCK(rt);
 					senderr(error);
 				}
 				if (ifp->if_flags & IFF_POINTOPOINT)
 					info.rti_info[RTAX_BRD] =
 					    rt->rt_ifa->ifa_dstaddr;
 				rtm->rtm_index = ifp->if_index;
 			} else {
 				info.rti_info[RTAX_IFP] = NULL;
 				info.rti_info[RTAX_IFA] = NULL;
 			}
 		} else if ((ifp = rt->rt_ifp) != NULL) {
 			rtm->rtm_index = ifp->if_index;
 		}
 
 		/* Check if we need to realloc storage */
 		rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len);
 		if (len > alloc_len) {
 			struct rt_msghdr *new_rtm;
 			new_rtm = malloc(len, M_TEMP, M_NOWAIT);
 			if (new_rtm == NULL) {
 				RT_UNLOCK(rt);
 				senderr(ENOBUFS);
 			}
 			bcopy(rtm, new_rtm, rtm->rtm_msglen);
 			free(rtm, M_TEMP);
 			rtm = new_rtm;
 			alloc_len = len;
 		}
 
 		w.w_tmem = (caddr_t)rtm;
 		w.w_tmemsize = alloc_len;
 		rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len);
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_addrs = info.rti_addrs;
 
 		RT_UNLOCK(rt);
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 flush:
 	if (rt != NULL)
 		RTFREE(rt);
 	/*
 	 * Check to see if we don't want our own messages.
 	 */
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		if (V_route_cb.any_count <= 1) {
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return (error);
 		}
 		/* There is another listener, so construct message */
 		rp = sotorawcb(so);
 	}
 
 	if (rtm != NULL) {
 #ifdef INET6
 		if (rti_need_deembed) {
 			/* sin6_scope_id is recovered before sending rtm. */
 			sin6 = (struct sockaddr_in6 *)&ss;
 			for (i = 0; i < RTAX_MAX; i++) {
 				if (info.rti_info[i] == NULL)
 					continue;
 				if (info.rti_info[i]->sa_family != AF_INET6)
 					continue;
 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					bcopy(sin6, info.rti_info[i],
 						    sizeof(*sin6));
 			}
 		}
 #endif
 		if (error != 0)
 			rtm->rtm_errno = error;
 		else
 			rtm->rtm_flags |= RTF_DONE;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 
 		free(rtm, M_TEMP);
 	}
 	if (m != NULL) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 		if (rp) {
 			/*
 			 * XXX insure we don't get a copy by
 			 * invalidating our protocol
 			 */
 			unsigned short family = rp->rcb_proto.sp_family;
 			rp->rcb_proto.sp_family = 0;
 			rt_dispatch(m, saf);
 			rp->rcb_proto.sp_family = family;
 		} else
 			rt_dispatch(m, saf);
 	}
 
 	return (error);
 }
 
 static void
 rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
 {
 
 	bzero(out, sizeof(*out));
 	out->rmx_mtu = rt->rt_mtu;
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
 	/* Kernel -> userland timebase conversion. */
 	out->rmx_expire = rt->rt_expire ?
 	    rt->rt_expire - time_uptime + time_second : 0;
 }
 
 /*
  * Extract the addresses of the passed sockaddrs.
  * Do a little sanity checking so as to avoid bad memory references.
  * This data is derived straight from userland.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr *sa;
 	int i;
 
 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
 			continue;
 		sa = (struct sockaddr *)cp;
 		/*
 		 * It won't fit.
 		 */
 		if (cp + sa->sa_len > cplim)
 			return (EINVAL);
 		/*
 		 * there are no more.. quit now
 		 * If there are more bits, they are in error.
 		 * I've seen this. route(1) can evidently generate these. 
 		 * This causes kernel to core dump.
 		 * for compatibility, If we see this, point to a safe address.
 		 */
 		if (sa->sa_len == 0) {
 			rtinfo->rti_info[i] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 		/* accept it */
 #ifdef INET6
 		if (sa->sa_family == AF_INET6)
 			sa6_embedscope((struct sockaddr_in6 *)sa,
 			    V_ip6_use_defzone);
 #endif
 		rtinfo->rti_info[i] = sa;
 		cp += SA_SIZE(sa);
 	}
 	return (0);
 }
 
 /*
  * Fill in @dmask with valid netmask leaving original @smask
  * intact. Mostly used with radix netmasks.
  */
 static struct sockaddr *
 rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 	if (dst == NULL || smask == NULL)
 		return (NULL);
 
 	memset(dmask, 0, dst->sa_len);
 	memcpy(dmask, smask, smask->sa_len);
 	dmask->ss_len = dst->sa_len;
 	dmask->ss_family = dst->sa_family;
 
 	return ((struct sockaddr *)dmask);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of vaious events (if/ifa announces,etc)
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	int i;
 	struct sockaddr *sa;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 	int len, dlen;
 
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 #ifdef INET6
 		if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)&ss;
 			bcopy(sa, sin6, sizeof(*sin6));
 			if (sa6_recoverscope(sin6) == 0)
 				sa = (struct sockaddr *)sin6;
 		}
 #endif
 		m_copyback(m, len, dlen, (caddr_t)sa);
 		len += dlen;
 	}
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * Returns 0 on success.
  *
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	int i;
 	int len, buflen = 0, dlen;
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32)
 				len = sizeof(struct ifa_msghdrl32);
 			else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		buflen = w->w_tmemsize - len;
 		cp = (caddr_t)w->w_tmem + len;
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 		if (cp != NULL && buflen >= dlen) {
 #ifdef INET6
 			if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)&ss;
 				bcopy(sa, sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					sa = (struct sockaddr *)sin6;
 			}
 #endif
 			bcopy((caddr_t)sa, cp, (unsigned)dlen);
 			cp += dlen;
 			buflen -= dlen;
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 
 		len += dlen;
 	}
 
 	if (cp != NULL) {
 		dlen = ALIGN(len) - len;
 		if (buflen < dlen)
 			cp = NULL;
 		else
 			buflen -= dlen;
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occured, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that the status of a network interface has changed.
  */
 void
 rt_ifmsg(struct ifnet *ifp)
 {
 	struct if_msghdr *ifm;
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return;
 	bzero((caddr_t)&info, sizeof(info));
 	m = rtsock_msg_mbuf(RTM_IFINFO, &info);
 	if (m == NULL)
 		return;
 	ifm = mtod(m, struct if_msghdr *);
 	ifm->ifm_index = ifp->if_index;
 	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &ifm->ifm_data);
 	ifm->ifm_addrs = 0;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  * Returns 0 on success.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal.
  * Please do not call directly, use rt_routemsg().
  * Note that @rt data MAY be inconsistent/invalid:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	struct mbuf *m;
 	struct rt_msghdr *rtm;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = sa = rt_key(rt);
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL)
 		return (ENOBUFS);
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_index = ifp->if_index;
 	rtm->rtm_flags |= rt->rt_flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberhips.  This is easier since
  * there is no route state to worry about.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m = NULL;
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct ifma_msghdr *ifmam;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 	m = rtsock_msg_mbuf(cmd, &info);
 	if (m == NULL)
 		return;
 	ifmam = mtod(m, struct ifma_msghdr *);
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	ifmam->ifmam_index = ifp->if_index;
 	ifmam->ifmam_addrs = info.rti_addrs;
 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
 }
 
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *ifan;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return NULL;
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m != NULL) {
 		ifan = mtod(m, struct if_announcemsghdr *);
 		ifan->ifan_index = ifp->if_index;
 		strlcpy(ifan->ifan_name, ifp->if_xname,
 			sizeof(ifan->ifan_name));
 		ifan->ifan_what = what;
 	}
 	return m;
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m != NULL) {
 		/*
 		 * Append the ieee80211 data.  Try to stick it in the
 		 * mbuf containing the ifannounce msg; otherwise allocate
 		 * a new mbuf and append.
 		 *
 		 * NB: we assume m is a single mbuf.
 		 */
 		if (data_len > M_TRAILINGSPACE(m)) {
 			struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 			if (n == NULL) {
 				m_freem(m);
 				return;
 			}
 			bcopy(data, mtod(n, void *), data_len);
 			n->m_len = data_len;
 			m->m_next = n;
 		} else if (data_len > 0) {
 			bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 			m->m_len += data_len;
 		}
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len += data_len;
 		mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 		rt_dispatch(m, AF_UNSPEC);
 	}
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * network interface arrival and departure.
  */
 void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (m != NULL)
 		rt_dispatch(m, AF_UNSPEC);
 }
 
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 	struct m_tag *tag;
 
 	/*
 	 * Preserve the family from the sockaddr, if any, in an m_tag for
 	 * use when injecting the mbuf into the routing socket buffer from
 	 * the netisr.
 	 */
 	if (saf != AF_UNSPEC) {
 		tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
 		    M_NOWAIT);
 		if (tag == NULL) {
 			m_freem(m);
 			return;
 		}
 		*(unsigned short *)(tag + 1) = saf;
 		m_tag_prepend(m, tag);
 	}
 #ifdef VIMAGE
 	if (V_loif)
 		m->m_pkthdr.rcvif = V_loif;
 	else {
 		m_freem(m);
 		return;
 	}
 #endif
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * This is used in dumping the kernel table via sysctl().
  */
 static int
 sysctl_dumpentry(struct radix_node *rn, void *vw)
 {
 	struct walkarg *w = vw;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int error = 0, size;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 
 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
 		return 0;
 	if ((rt->rt_flags & RTF_HOST) == 0
 	    ? jailed_without_vnet(w->w_req->td->td_ucred)
 	    : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
 		return (0);
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 	    rt_mask(rt), &ss);
 	info.rti_info[RTAX_GENMASK] = 0;
 	if (rt->rt_ifp) {
 		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
 	}
 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
 		return (error);
 	if (w->w_req && w->w_tmem) {
 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_index = rt->rt_ifp->if_index;
 		rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
 		rtm->rtm_addrs = info.rti_addrs;
 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
 		return (error);
 	}
 	return (error);
 }
 
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *ifam;
 	struct if_data *ifd;
 
 	ifam = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *ifam32;
 
 		ifam32 = (struct ifa_msghdrl32 *)ifam;
 		ifam32->ifam_addrs = info->rti_addrs;
 		ifam32->ifam_flags = ifa->ifa_flags;
 		ifam32->ifam_index = ifa->ifa_ifp->if_index;
 		ifam32->_ifam_spare1 = 0;
 		ifam32->ifam_len = sizeof(*ifam32);
 		ifam32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam32->ifam_data;
 	} else
 #endif
 	{
 		ifam->ifam_addrs = info->rti_addrs;
 		ifam->ifam_flags = ifa->ifa_flags;
 		ifam->ifam_index = ifa->ifa_ifp->if_index;
 		ifam->_ifam_spare1 = 0;
 		ifam->ifam_len = sizeof(*ifam);
 		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam->ifam_data;
 	}
 
 	bzero(ifd, sizeof(*ifd));
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *ifam;
 
 	ifam = (struct ifa_msghdr *)w->w_tmem;
 	ifam->ifam_addrs = info->rti_addrs;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_index = ifa->ifa_ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &info, w, len);
 			else
 				error = sysctl_iflist_ifm(ifp, &info, w, len);
 			if (error)
 				goto done;
 		}
 		while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	if (ifp != NULL)
 		IF_ADDR_RUNLOCK(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma;
 	struct	rt_addrinfo info;
 	int	len, error = 0;
 	struct ifaddr *ifa;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			info.rti_info[RTAX_GATEWAY] =
 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
 			    ifma->ifma_lladdr : NULL;
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				struct ifma_msghdr *ifmam;
 
 				ifmam = (struct ifma_msghdr *)w->w_tmem;
 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
 				ifmam->ifmam_flags = 0;
 				ifmam->ifmam_addrs = info.rti_addrs;
 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 				if (error) {
 					IF_ADDR_RUNLOCK(ifp);
 					goto done;
 				}
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct rib_head *rnh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	name ++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	if (name[1] == NET_RT_DUMP) {
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 	
 	/*
 	 * Allocate reply buffer in advance.
 	 * All rtsock messages has maximum length of u_short.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	switch (w.w_op) {
 
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rnh = rt_tables_get_rnh(fib, i);
 			if (rnh != NULL) {
 				RIB_RLOCK(rnh); 
 			    	error = rnh->rnh_walktree(&rnh->head,
 				    sysctl_dumpentry, &w);
 				RIB_RUNLOCK(rnh);
 			} else if (af != 0)
 				error = EAFNOSUPPORT;
 		}
 		break;
 
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
 static struct protosw routesw[] = {
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&routedomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_output =		route_output,
 	.pr_ctlinput =		raw_ctlinput,
 	.pr_init =		raw_init,
 	.pr_usrreqs =		&route_usrreqs
 }
 };
 
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		 "route",
 	.dom_protosw =		routesw,
 	.dom_protoswNPROTOSW =	&routesw[sizeof(routesw)/sizeof(routesw[0])]
 };
 
 VNET_DOMAIN_SET(route);
Index: projects/vnet/sys/net/vnet.c
===================================================================
--- projects/vnet/sys/net/vnet.c	(revision 299172)
+++ projects/vnet/sys/net/vnet.c	(revision 299173)
@@ -1,794 +1,804 @@
 /*-
  * Copyright (c) 2004-2009 University of Zagreb
  * Copyright (c) 2006-2009 FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by the University of Zagreb and the
  * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
  * FreeBSD Foundation.
  *
  * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kdb.h"
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/jail.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <machine/stdarg.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 /*-
  * This file implements core functions for virtual network stacks:
  *
  * - Virtual network stack management functions.
  *
  * - Virtual network stack memory allocator, which virtualizes global
  *   variables in the network stack
  *
  * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems
  *   to register startup/shutdown events to be run for each virtual network
  *   stack instance.
  */
 
 FEATURE(vimage, "VIMAGE kernel virtualization");
 
 static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");
 
 /*
  * The virtual network stack list has two read-write locks, one sleepable and
  * the other not, so that the list can be stablized and walked in a variety
  * of network stack contexts.  Both must be acquired exclusively to modify
  * the list, but a read lock of either lock is sufficient to walk the list.
  */
 struct rwlock		vnet_rwlock;
 struct sx		vnet_sxlock;
 
 #define	VNET_LIST_WLOCK() do {						\
 	sx_xlock(&vnet_sxlock);						\
 	rw_wlock(&vnet_rwlock);						\
 } while (0)
 
 #define	VNET_LIST_WUNLOCK() do {					\
 	rw_wunlock(&vnet_rwlock);					\
 	sx_xunlock(&vnet_sxlock);					\
 } while (0)
 
 struct vnet_list_head vnet_head;
 struct vnet *vnet0;
 
 /*
  * The virtual network stack allocator provides storage for virtualized
  * global variables.  These variables are defined/declared using the
  * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet'
  * linker set.  The details of the implementation are somewhat subtle, but
  * allow the majority of most network subsystems to maintain
  * virtualization-agnostic.
  *
  * The virtual network stack allocator handles variables in the base kernel
  * vs. modules in similar but different ways.  In both cases, virtualized
  * global variables are marked as such by being declared to be part of the
  * vnet linker set.  These "master" copies of global variables serve two
  * functions:
  *
  * (1) They contain static initialization or "default" values for global
  *     variables which will be propagated to each virtual network stack
  *     instance when created.  As with normal global variables, they default
  *     to zero-filled.
  *
  * (2) They act as unique global names by which the variable can be referred
  *     to, regardless of network stack instance.  The single global symbol
  *     will be used to calculate the location of a per-virtual instance
  *     variable at run-time.
  *
  * Each virtual network stack instance has a complete copy of each
  * virtualized global variable, stored in a malloc'd block of memory
  * referred to by vnet->vnet_data_mem.  Critical to the design is that each
  * per-instance memory block is laid out identically to the master block so
  * that the offset of each global variable is the same across all blocks.  To
  * optimize run-time access, a precalculated 'base' address,
  * vnet->vnet_data_base, is stored in each vnet, and is the amount that can
  * be added to the address of a 'master' instance of a variable to get to the
  * per-vnet instance.
  *
  * Virtualized global variables are handled in a similar manner, but as each
  * module has its own 'set_vnet' linker set, and we want to keep all
  * virtualized globals togther, we reserve space in the kernel's linker set
  * for potential module variables using a per-vnet character array,
  * 'modspace'.  The virtual network stack allocator maintains a free list to
  * track what space in the array is free (all, initially) and as modules are
  * linked, allocates portions of the space to specific globals.  The kernel
  * module linker queries the virtual network stack allocator and will
  * bind references of the global to the location during linking.  It also
  * calls into the virtual network stack allocator, once the memory is
  * initialized, in order to propagate the new static initializations to all
  * existing virtual network stack instances so that the soon-to-be executing
  * module will find every network stack instance with proper default values.
  */
 
 /*
  * Number of bytes of data in the 'set_vnet' linker set, and hence the total
  * size of all kernel virtualized global variables, and the malloc(9) type
  * that will be used to allocate it.
  */
 #define	VNET_BYTES	(VNET_STOP - VNET_START)
 
 static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data");
 
 /*
  * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of
  * global variables across all loaded modules.  As this actually sizes an
  * array declared as a virtualized global variable in the kernel itself, and
  * we want the virtualized global variable space to be page-sized, we may
  * have more space than that in practice.
  */
 #define	VNET_MODMIN	8192
 #define	VNET_SIZE	roundup2(VNET_BYTES, PAGE_SIZE)
 
 /*
  * Space to store virtualized global variables from loadable kernel modules,
  * and the free list to manage it.
  */
 static VNET_DEFINE(char, modspace[VNET_MODMIN]);
 
 /*
  * Global lists of subsystem constructor and destructors for vnets.  They are
  * registered via VNET_SYSINIT() and VNET_SYSUNINIT().  Both lists are
  * protected by the vnet_sysinit_sxlock global lock.
  */
 static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors =
 	TAILQ_HEAD_INITIALIZER(vnet_constructors);
 static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors =
 	TAILQ_HEAD_INITIALIZER(vnet_destructors);
 
 struct sx		vnet_sysinit_sxlock;
 
 #define	VNET_SYSINIT_WLOCK()	sx_xlock(&vnet_sysinit_sxlock);
 #define	VNET_SYSINIT_WUNLOCK()	sx_xunlock(&vnet_sysinit_sxlock);
 #define	VNET_SYSINIT_RLOCK()	sx_slock(&vnet_sysinit_sxlock);
 #define	VNET_SYSINIT_RUNLOCK()	sx_sunlock(&vnet_sysinit_sxlock);
 
 struct vnet_data_free {
 	uintptr_t	vnd_start;
 	int		vnd_len;
 	TAILQ_ENTRY(vnet_data_free) vnd_link;
 };
 
 static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free",
     "VNET resource accounting");
 static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head =
 	    TAILQ_HEAD_INITIALIZER(vnet_data_free_head);
 static struct sx vnet_data_free_lock;
 
 SDT_PROVIDER_DEFINE(vnet);
 SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int",
     "struct vnet *");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return,
     "int", "struct vnet *");
 SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry,
     "int", "struct vnet *");
 SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return,
     "int");
 
 #ifdef DDB
 static void db_show_vnet_print_vs(struct vnet_sysinit *, int);
 #endif
 
 /*
  * Allocate a virtual network stack.
  */
 struct vnet *
 vnet_alloc(void)
 {
 	struct vnet *vnet;
 
 	SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__);
 	vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO);
 	vnet->vnet_magic_n = VNET_MAGIC_N;
 	vnet->vnet_state = 0;
 	SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet);
 
 	/*
 	 * Allocate storage for virtualized global variables and copy in
 	 * initial values form our 'master' copy.
 	 */
 	vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK);
 	memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES);
 
 	/*
 	 * All use of vnet-specific data will immediately subtract VNET_START
 	 * from the base memory pointer, so pre-calculate that now to avoid
 	 * it on each use.
 	 */
 	vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START;
 
 	/* Initialize / attach vnet module instances. */
 	CURVNET_SET_QUIET(vnet);
 	vnet_sysinit();
 	CURVNET_RESTORE();
 
 	VNET_LIST_WLOCK();
 	LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le);
 	VNET_LIST_WUNLOCK();
 
 	SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet);
 	return (vnet);
 }
 
 /*
  * Destroy a virtual network stack.
  */
 void
 vnet_destroy(struct vnet *vnet)
 {
 
 	SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet);
 	KASSERT(vnet->vnet_sockcnt == 0,
 	    ("%s: vnet still has sockets", __func__));
 
 	VNET_LIST_WLOCK();
 	LIST_REMOVE(vnet, vnet_le);
 	VNET_LIST_WUNLOCK();
 
 	CURVNET_SET_QUIET(vnet);
 	vnet_sysuninit();
 	CURVNET_RESTORE();
 
 	/*
 	 * Release storage for the virtual network stack instance.
 	 */
 	free(vnet->vnet_data_mem, M_VNET_DATA);
 	vnet->vnet_data_mem = NULL;
 	vnet->vnet_data_base = 0;
 	vnet->vnet_magic_n = 0xdeadbeef;
 	free(vnet, M_VNET);
 	SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__);
 }
 
 /*
  * Boot time initialization and allocation of virtual network stacks.
  */
 static void
 vnet_init_prelink(void *arg)
 {
 
 	rw_init(&vnet_rwlock, "vnet_rwlock");
 	sx_init(&vnet_sxlock, "vnet_sxlock");
 	sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock");
 	LIST_INIT(&vnet_head);
 }
 SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST,
     vnet_init_prelink, NULL);
 
 static void
-vnet0_init(void *arg)
+vnet0_init(void *arg __unused)
 {
 
 	/* Warn people before take off - in case we crash early. */
 	printf("WARNING: VIMAGE (virtualized network stack) is a highly "
 	    "experimental feature.\n");
 
 	/*
 	 * We MUST clear curvnet in vi_init_done() before going SMP,
 	 * otherwise CURVNET_SET() macros would scream about unnecessary
 	 * curvnet recursions.
 	 */
 	curvnet = prison0.pr_vnet = vnet0 = vnet_alloc();
 }
 SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL);
 
 static void
-vnet_init_done(void *unused)
+vnet_init_done(void *unused __unused)
 {
 
 	curvnet = NULL;
 }
-SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done,
+SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_init_done,
     NULL);
 
 /*
  * Once on boot, initialize the modspace freelist to entirely cover modspace.
  */
 static void
 vnet_data_startup(void *dummy __unused)
 {
 	struct vnet_data_free *df;
 
 	df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
 	df->vnd_start = (uintptr_t)&VNET_NAME(modspace);
 	df->vnd_len = VNET_MODMIN;
 	TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link);
 	sx_init(&vnet_data_free_lock, "vnet_data alloc lock");
 }
 SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0);
 
+/* Dummy VNET_SYSINIT to make sure we reach the final end state. */
+static void
+vnet_sysinit_done(void *unused __unused)
+{
+
+	return;
+}
+VNET_SYSINIT(vnet_sysinit_done, SI_SUB_VNET_DONE, SI_ORDER_ANY,
+    vnet_sysinit_done, NULL);
+
 /*
  * When a module is loaded and requires storage for a virtualized global
  * variable, allocate space from the modspace free list.  This interface
  * should be used only by the kernel linker.
  */
 void *
 vnet_data_alloc(int size)
 {
 	struct vnet_data_free *df;
 	void *s;
 
 	s = NULL;
 	size = roundup2(size, sizeof(void *));
 	sx_xlock(&vnet_data_free_lock);
 	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
 		if (df->vnd_len < size)
 			continue;
 		if (df->vnd_len == size) {
 			s = (void *)df->vnd_start;
 			TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link);
 			free(df, M_VNET_DATA_FREE);
 			break;
 		}
 		s = (void *)df->vnd_start;
 		df->vnd_len -= size;
 		df->vnd_start = df->vnd_start + size;
 		break;
 	}
 	sx_xunlock(&vnet_data_free_lock);
 
 	return (s);
 }
 
 /*
  * Free space for a virtualized global variable on module unload.
  */
 void
 vnet_data_free(void *start_arg, int size)
 {
 	struct vnet_data_free *df;
 	struct vnet_data_free *dn;
 	uintptr_t start;
 	uintptr_t end;
 
 	size = roundup2(size, sizeof(void *));
 	start = (uintptr_t)start_arg;
 	end = start + size;
 	/*
 	 * Free a region of space and merge it with as many neighbors as
 	 * possible.  Keeping the list sorted simplifies this operation.
 	 */
 	sx_xlock(&vnet_data_free_lock);
 	TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) {
 		if (df->vnd_start > end)
 			break;
 		/*
 		 * If we expand at the end of an entry we may have to merge
 		 * it with the one following it as well.
 		 */
 		if (df->vnd_start + df->vnd_len == start) {
 			df->vnd_len += size;
 			dn = TAILQ_NEXT(df, vnd_link);
 			if (df->vnd_start + df->vnd_len == dn->vnd_start) {
 				df->vnd_len += dn->vnd_len;
 				TAILQ_REMOVE(&vnet_data_free_head, dn,
 				    vnd_link);
 				free(dn, M_VNET_DATA_FREE);
 			}
 			sx_xunlock(&vnet_data_free_lock);
 			return;
 		}
 		if (df->vnd_start == end) {
 			df->vnd_start = start;
 			df->vnd_len += size;
 			sx_xunlock(&vnet_data_free_lock);
 			return;
 		}
 	}
 	dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO);
 	dn->vnd_start = start;
 	dn->vnd_len = size;
 	if (df)
 		TAILQ_INSERT_BEFORE(df, dn, vnd_link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link);
 	sx_xunlock(&vnet_data_free_lock);
 }
 
 /*
  * When a new virtualized global variable has been allocated, propagate its
  * initial value to each already-allocated virtual network stack instance.
  */
 void
 vnet_data_copy(void *start, int size)
 {
 	struct vnet *vnet;
 
 	VNET_LIST_RLOCK();
 	LIST_FOREACH(vnet, &vnet_head, vnet_le)
 		memcpy((void *)((uintptr_t)vnet->vnet_data_base +
 		    (uintptr_t)start), start, size);
 	VNET_LIST_RUNLOCK();
 }
 
 /*
  * Support for special SYSINIT handlers registered via VNET_SYSINIT()
  * and VNET_SYSUNINIT().
  */
 void
 vnet_register_sysinit(void *arg)
 {
 	struct vnet_sysinit *vs, *vs2;	
 	struct vnet *vnet;
 
 	vs = arg;
 	KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early"));
 
 	/* Add the constructor to the global list of vnet constructors. */
 	VNET_SYSINIT_WLOCK();
 	TAILQ_FOREACH(vs2, &vnet_constructors, link) {
 		if (vs2->subsystem > vs->subsystem)
 			break;
 		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
 			break;
 	}
 	if (vs2 != NULL)
 		TAILQ_INSERT_BEFORE(vs2, vs, link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_constructors, vs, link);
 
 	/*
 	 * Invoke the constructor on all the existing vnets when it is
 	 * registered.
 	 */
 	VNET_FOREACH(vnet) {
 		CURVNET_SET_QUIET(vnet);
 		vs->func(vs->arg);
 		CURVNET_RESTORE();
 	}
 	VNET_SYSINIT_WUNLOCK();
 }
 
 void
 vnet_deregister_sysinit(void *arg)
 {
 	struct vnet_sysinit *vs;
 
 	vs = arg;
 
 	/* Remove the constructor from the global list of vnet constructors. */
 	VNET_SYSINIT_WLOCK();
 	TAILQ_REMOVE(&vnet_constructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 void
 vnet_register_sysuninit(void *arg)
 {
 	struct vnet_sysinit *vs, *vs2;
 
 	vs = arg;
 
 	/* Add the destructor to the global list of vnet destructors. */
 	VNET_SYSINIT_WLOCK();
 	TAILQ_FOREACH(vs2, &vnet_destructors, link) {
 		if (vs2->subsystem > vs->subsystem)
 			break;
 		if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
 			break;
 	}
 	if (vs2 != NULL)
 		TAILQ_INSERT_BEFORE(vs2, vs, link);
 	else
 		TAILQ_INSERT_TAIL(&vnet_destructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 void
 vnet_deregister_sysuninit(void *arg)
 {
 	struct vnet_sysinit *vs;
 	struct vnet *vnet;
 
 	vs = arg;
 
 	/*
 	 * Invoke the destructor on all the existing vnets when it is
 	 * deregistered.
 	 */
 	VNET_SYSINIT_WLOCK();
 	VNET_FOREACH(vnet) {
 		CURVNET_SET_QUIET(vnet);
 		vs->func(vs->arg);
 		CURVNET_RESTORE();
 	}
 
 	/* Remove the destructor from the global list of vnet destructors. */
 	TAILQ_REMOVE(&vnet_destructors, vs, link);
 	VNET_SYSINIT_WUNLOCK();
 }
 
 /*
  * Invoke all registered vnet constructors on the current vnet.  Used during
  * vnet construction.  The caller is responsible for ensuring the new vnet is
  * the current vnet and that the vnet_sysinit_sxlock lock is locked.
  */
 void
 vnet_sysinit(void)
 {
 	struct vnet_sysinit *vs;
 
 	VNET_SYSINIT_RLOCK();
 	TAILQ_FOREACH(vs, &vnet_constructors, link) {
 		curvnet->vnet_state = vs->subsystem;
 		vs->func(vs->arg);
 	}
 	VNET_SYSINIT_RUNLOCK();
 }
 
 /*
  * Invoke all registered vnet destructors on the current vnet.  Used during
  * vnet destruction.  The caller is responsible for ensuring the dying vnet
  * the current vnet and that the vnet_sysinit_sxlock lock is locked.
  */
 void
 vnet_sysuninit(void)
 {
 	struct vnet_sysinit *vs;
 
 	VNET_SYSINIT_RLOCK();
 	TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head,
 	    link) {
 		curvnet->vnet_state = vs->subsystem;
 		vs->func(vs->arg);
 	}
 	VNET_SYSINIT_RUNLOCK();
 }
 
 /*
  * EVENTHANDLER(9) extensions.
  */
 /*
  * Invoke the eventhandler function originally registered with the possibly
  * registered argument for all virtual network stack instances.
  *
  * This iterator can only be used for eventhandlers that do not take any
  * additional arguments, as we do ignore the variadic arguments from the
  * EVENTHANDLER_INVOKE() call.
  */
 void
 vnet_global_eventhandler_iterator_func(void *arg, ...)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct eventhandler_entry_vimage *v_ee;
 
 	/*
 	 * There is a bug here in that we should actually cast things to
 	 * (struct eventhandler_entry_ ## name *)  but that's not easily
 	 * possible in here so just re-using the variadic version we
 	 * defined for the generic vimage case.
 	 */
 	v_ee = arg;
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 }
 
 #ifdef VNET_DEBUG
 struct vnet_recursion {
 	SLIST_ENTRY(vnet_recursion)	 vnr_le;
 	const char			*prev_fn;
 	const char			*where_fn;
 	int				 where_line;
 	struct vnet			*old_vnet;
 	struct vnet			*new_vnet;
 };
 
 static SLIST_HEAD(, vnet_recursion) vnet_recursions =
     SLIST_HEAD_INITIALIZER(vnet_recursions);
 
 static void
 vnet_print_recursion(struct vnet_recursion *vnr, int brief)
 {
 
 	if (!brief)
 		printf("CURVNET_SET() recursion in ");
 	printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line,
 	    vnr->prev_fn);
 	if (brief)
 		printf(", ");
 	else
 		printf("\n    ");
 	printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet);
 }
 
 void
 vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line)
 {
 	struct vnet_recursion *vnr;
 
 	/* Skip already logged recursion events. */
 	SLIST_FOREACH(vnr, &vnet_recursions, vnr_le)
 		if (vnr->prev_fn == old_fn &&
 		    vnr->where_fn == curthread->td_vnet_lpush &&
 		    vnr->where_line == line &&
 		    (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet))
 			return;
 
 	vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO);
 	if (vnr == NULL)
 		panic("%s: malloc failed", __func__);
 	vnr->prev_fn = old_fn;
 	vnr->where_fn = curthread->td_vnet_lpush;
 	vnr->where_line = line;
 	vnr->old_vnet = old_vnet;
 	vnr->new_vnet = curvnet;
 
 	SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le);
 
 	vnet_print_recursion(vnr, 0);
 #ifdef KDB
 	kdb_backtrace();
 #endif
 }
 #endif /* VNET_DEBUG */
 
 /*
  * DDB(4).
  */
 #ifdef DDB
 static void
 db_vnet_print(struct vnet *vnet)
 {
 
 	db_printf("vnet            = %p\n", vnet);
 	db_printf(" vnet_magic_n   = 0x%x (%s, orig 0x%x)\n",
 	    vnet->vnet_magic_n,
 	    (vnet->vnet_magic_n == VNET_MAGIC_N) ?
 		"ok" : "mismatch", VNET_MAGIC_N);
 	db_printf(" vnet_ifcnt     = %u\n", vnet->vnet_ifcnt);
 	db_printf(" vnet_sockcnt   = %u\n", vnet->vnet_sockcnt);
 	db_printf(" vnet_data_mem  = %p\n", vnet->vnet_data_mem);
 	db_printf(" vnet_data_base = 0x%jx\n",
 	    (uintmax_t)vnet->vnet_data_base);
-	db_printf(" vnet_state     = %#x\n", vnet->vnet_state);
+	db_printf(" vnet_state     = %#08x\n", vnet->vnet_state);
 	db_printf("\n");
 }
 
 DB_SHOW_ALL_COMMAND(vnets, db_show_all_vnets)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_FOREACH(vnet_iter) {
 		db_vnet_print(vnet_iter);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 DB_SHOW_COMMAND(vnet, db_show_vnet)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show vnet <struct vnet *>\n");
 		return;
 	}
 
 	db_vnet_print((struct vnet *)addr);
 }
 
 static void
 db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb)
 {
 	const char *vsname, *funcname;
 	c_db_sym_t sym;
 	db_expr_t  offset;
 
 #define xprint(...)							\
 	if (ddb)							\
 		db_printf(__VA_ARGS__);					\
 	else								\
 		printf(__VA_ARGS__)
 
 	if (vs == NULL) {
 		xprint("%s: no vnet_sysinit * given\n", __func__);
 		return;
 	}
 
 	sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset);
 	db_symbol_values(sym, &vsname, NULL);
 	sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset);
 	db_symbol_values(sym, &funcname, NULL);
 	xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs);
-	xprint("  0x%08x 0x%08x\n", vs->subsystem, vs->order);
+	xprint("  %#08x %#08x\n", vs->subsystem, vs->order);
 	xprint("  %p(%s)(%p)\n",
 	    vs->func, (funcname != NULL) ? funcname : "", vs->arg);
 #undef xprint
 }
 
 DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit)
 {
 	struct vnet_sysinit *vs;
 
 	db_printf("VNET_SYSINIT vs Name(Ptr)\n");
 	db_printf("  Subsystem  Order\n");
 	db_printf("  Function(Name)(Arg)\n");
 	TAILQ_FOREACH(vs, &vnet_constructors, link) {
 		db_show_vnet_print_vs(vs, 1);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit)
 {
 	struct vnet_sysinit *vs;
 
 	db_printf("VNET_SYSUNINIT vs Name(Ptr)\n");
 	db_printf("  Subsystem  Order\n");
 	db_printf("  Function(Name)(Arg)\n");
 	TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head,
 	    link) {
 		db_show_vnet_print_vs(vs, 1);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #ifdef VNET_DEBUG
 DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs)
 {
 	struct vnet_recursion *vnr;
 
 	SLIST_FOREACH(vnr, &vnet_recursions, vnr_le)
 		vnet_print_recursion(vnr, 1);
 }
 #endif
 #endif /* DDB */
Index: projects/vnet/sys/netinet/if_ether.c
===================================================================
--- projects/vnet/sys/netinet/if_ether.c	(revision 299172)
+++ projects/vnet/sys/netinet/if_ether.c	(revision 299173)
@@ -1,1353 +1,1355 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #define SIN(s) ((const struct sockaddr_in *)(s))
 
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 /* Simple ARP state machine */
 enum arp_llinfo_state {
 	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
 	ARP_LLINFO_REACHABLE,	/* LLE is valid */
 	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
 	ARP_LLINFO_DELETED,	/* LLE is deleted */
 };
 
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
 
 /* timer values */
 static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 static VNET_DEFINE(int, arp_maxtries) = 5;
 static VNET_DEFINE(int, arp_proxyall) = 0;
 static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 static VNET_DEFINE(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, arp_maxhold) = 1;
 
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arpt_rexmit		VNET(arpt_rexmit)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 #define	ARP_LOG(pri, ...)	do {					\
 	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
     struct ifnet *ifp, int bridged, struct llentry *la);
 static void arp_mark_lle_reachable(struct llentry *la);
 static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
 
 static eventhandler_tag iflladdr_tag;
 
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 	int r_skip_req;
 
 	if (lle->la_flags & LLE_STATIC) {
 		return;
 	}
 	LLE_WLOCK(lle);
 	if (callout_pending(&lle->lle_timer)) {
 		/*
 		 * Here we are a bit odd here in the treatment of 
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
 		 * it would have return 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by arpresolve() below.
 		 */
 		LLE_WUNLOCK(lle);
  		return;
  	}
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	switch (lle->ln_state) {
 	case ARP_LLINFO_REACHABLE:
 
 		/*
 		 * Expiration time is approaching.
 		 * Let's try to refresh entry if it is still
 		 * in use.
 		 *
 		 * Set r_skip_req to get feedback from
 		 * fast path. Change state and re-schedule
 		 * ourselves.
 		 */
 		LLE_REQ_LOCK(lle);
 		lle->r_skip_req = 1;
 		LLE_REQ_UNLOCK(lle);
 		lle->ln_state = ARP_LLINFO_VERIFY;
 		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 		LLE_WUNLOCK(lle);
 		CURVNET_RESTORE();
 		return;
 	case ARP_LLINFO_VERIFY:
 		LLE_REQ_LOCK(lle);
 		r_skip_req = lle->r_skip_req;
 		LLE_REQ_UNLOCK(lle);
 
 		if (r_skip_req == 0 && lle->la_preempt > 0) {
 			/* Entry was used, issue refresh request */
 			struct in_addr dst;
 			dst = lle->r_l3addr.addr4;
 			lle->la_preempt--;
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			arprequest(ifp, NULL, &dst, NULL);
 			CURVNET_RESTORE();
 			return;
 		}
 		/* Nothing happened. Reschedule if not too late */
 		if (lle->la_expire > time_uptime) {
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			CURVNET_RESTORE();
 			return;
 		}
 		break;
 	case ARP_LLINFO_INCOMPLETE:
 	case ARP_LLINFO_DELETED:
 		break;
 	}
 
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->lle_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		LLE_REMREF(lle);
 		lltable_unlink_entry(lle->lle_tbl, lle);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	size_t pkts_dropped = llentry_free(lle);
 
 	ARPSTAT_ADD(dropped, pkts_dropped);
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Stores link-layer header for @ifp in format suitable for if_output()
  * into buffer @buf. Resulting header length is stored in @bufsize.
  *
  * Returns 0 on success.
  */
 static int
 arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
     size_t *bufsize)
 {
 	struct if_encap_req ereq;
 	int error;
 
 	bzero(buf, *bufsize);
 	bzero(&ereq, sizeof(ereq));
 	ereq.buf = buf;
 	ereq.bufsize = *bufsize;
 	ereq.rtype = IFENCAP_LL;
 	ereq.family = AF_ARP;
 	ereq.lladdr = ar_tha(ah);
 	ereq.hdata = (u_char *)ah;
 	if (bcast)
 		ereq.flags = IFENCAP_FLAG_BROADCAST;
 	error = ifp->if_requestencap(ifp, &ereq);
 	if (error == 0)
 		*bufsize = ereq.bufsize;
 
 	return (error);
 }
 
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	struct route ro;
 	int error;
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			if (ifa->ifa_carp) {
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return;
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return;
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
 	M_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
 	if (error != 0 && error != EAFNOSUPPORT) {
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		return;
 	}
 
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txrequests);
 }
 
 
 /*
  * Resolve an IP address into an ethernet address - heavy version.
  * Used internally by arpresolve().
  * We have already checked than  we can't use existing lle without
  * modification so we have to acquire LLE_EXCLUSIVE lle lock.
  *
  * On success, desten and flags are filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 static int
 arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = NULL, *la_tmp;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
 	char *lladdr;
 	int ll_len;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if ((flags & LLE_CREATE) == 0) {
 		IF_AFDATA_RLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		IF_AFDATA_RUNLOCK(ifp);
 	}
 	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa(SIN(dst)->sin_addr), if_name(ifp));
 			m_freem(m);
 			return (EINVAL);
 		}
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		/* Prefer ANY existing lle over newly-created one */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (la_tmp != NULL) {
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 		}
 	}
 	if (la == NULL) {
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		if (flags & LLE_ADDRONLY) {
 			lladdr = la->ll_addr;
 			ll_len = ifp->if_addrlen;
 		} else {
 			lladdr = la->r_linkdata;
 			ll_len = la->r_hdrlen;
 		}
 		bcopy(lladdr, desten, ll_len);
 
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		if (pflags != NULL)
 			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
 		LLE_WUNLOCK(la);
 		return (0);
 	}
 
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		if (la->la_numheld >= V_arp_maxhold) {
 			if (la->la_hold != NULL) {
 				next = la->la_hold->m_nextpkt;
 				m_freem(la->la_hold);
 				la->la_hold = next;
 				la->la_numheld--;
 				ARPSTAT_INC(dropped);
 			}
 		}
 		if (la->la_hold != NULL) {
 			curr = la->la_hold;
 			while (curr->m_nextpkt != NULL)
 				curr = curr->m_nextpkt;
 			curr->m_nextpkt = m;
 		} else
 			la->la_hold = m;
 		la->la_numheld++;
 	}
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled;
 
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		LLE_WUNLOCK(la);
 		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		return (error);
 	}
 
 	LLE_WUNLOCK(la);
 	return (error);
 }
 
 /*
  * Resolve an IP address into an ethernet address.
  */
 int
 arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags)
 {
 	int error;
 
 	flags |= LLE_ADDRONLY;
 	error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags);
 	return (error);
 }
 
 
 /*
  * Lookups link header based on an IP address.
  * On input:
  *    ifp is the interface we use
  *    is_gw != 0 if @dst represents gateway to some destination
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is the storage to put LL header.
  *    flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
  *
  * On success, full/partial link header and flags are filled in and
  * the function returns 0.
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 int
 arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = 0;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_UNLOCKED, dst);
 	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
 		/* Entry found, let's copy lle info */
 		bcopy(la->r_linkdata, desten, la->r_hdrlen);
 		if (pflags != NULL)
 			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		IF_AFDATA_RUNLOCK(ifp);
 		return (0);
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 
 	return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
 	    desten, pflags));
 }
 
 /*
  * Common length and type checks are done here,
  * then the protocol-specific routine is called.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 	struct ifnet *ifp;
 	char *layer;
 	int hlen;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
 		    if_name(ifp));
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	/* Check if length is sufficient */
 	if (m->m_len <  arphdr_len(ar)) {
 		m = m_pullup(m, arphdr_len(ar));
 		if (m == NULL) {
 			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
 			    if_name(ifp));
 			return;
 		}
 		ar = mtod(m, struct arphdr *);
 	}
 
 	hlen = 0;
 	layer = "";
 	switch (ntohs(ar->ar_hrd)) {
 	case ARPHRD_ETHER:
 		hlen = ETHER_ADDR_LEN; /* RFC 826 */
 		layer = "ethernet";
 		break;
 	case ARPHRD_IEEE802:
 		hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */
 		layer = "fddi";
 		break;
 	case ARPHRD_ARCNET:
 		hlen = 1; /* RFC 1201, ARC_ADDR_LEN */
 		layer = "arcnet";
 		break;
 	case ARPHRD_INFINIBAND:
 		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */ 
 		layer = "infiniband";
 		break;
 	case ARPHRD_IEEE1394:
 		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
 		layer = "firewire";
 
 		/*
 		 * Restrict too long hardware addresses.
 		 * Currently we are capable of handling 20-byte
 		 * addresses ( sizeof(lle->ll_addr) )
 		 */
 		if (ar->ar_hln >= 20)
 			hlen = 16;
 		break;
 	default:
 		ARP_LOG(LOG_NOTICE,
 		    "packet with unknown hardware format 0x%02d received on "
 		    "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	if (hlen != 0 && hlen != ar->ar_hln) {
 		ARP_LOG(LOG_NOTICE,
 		    "packet with invalid %s address length %d received on %s\n",
 		    layer, ar->ar_hln, if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	ARPSTAT_INC(received);
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 static int allow_multicast = 0;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL, *la_tmp;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	struct sockaddr *dst;
 	struct nhop4_basic nh4;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	struct route ro;
 	size_t linkhdrsize;
 	int lladdr_off;
 	int error;
 
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	/*
 	 * We already have checked that mbuf contains enough contiguous data
 	 * to hold entire arp message according to the arp header.
 	 */
 	ah = mtod(m, struct arphdr *);
 
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 	}
 	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				ifp = ia->ia_ifp;
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
 		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		goto drop;
 	}
 	ifa_ref(&ia->ia_ifa);
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 match:
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa(isaddr));
 		goto drop;
 	}
 
 	if (ifp->if_addrlen != ah->ar_hln) {
 		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 		    "i/f %d (ignored)\n", ifp->if_addrlen,
 		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 		    ifp->if_addrlen);
 		goto drop;
 	}
 
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa(isaddr), ifp->if_xname);
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	dst = (struct sockaddr *)&sin;
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (la != NULL)
 		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 	else if (itaddr.s_addr == myaddr.s_addr) {
 		/*
 		 * Request/reply to our address, but no lle exists yet.
 		 * Calculate full link prepend to use in lle.
 		 */
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 		    &linkhdrsize, &lladdr_off) != 0)
 			goto reply;
 
 		/* Allocate new entry */
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 
 			/*
 			 * lle creation may fail if source address belongs
 			 * to non-directly connected subnet. However, we
 			 * will try to answer the request instead of dropping
 			 * frame.
 			 */
 			goto reply;
 		}
 		lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off);
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 
 		/*
 		 * Check if lle still does not exists.
 		 * If it does, that means that we either
 		 * 1) have configured it explicitly, via
 		 * 1a) 'arp -s' static entry or
 		 * 1b) interface address static record
 		 * or
 		 * 2) it was the result of sending first packet to-host
 		 * or
 		 * 3) it was another arp reply packet we handled in
 		 * different thread.
 		 *
 		 * In all cases except 3) we definitely need to prefer
 		 * existing lle. For the sake of simplicity, prefer any
 		 * existing lle over newly-create one.
 		 */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 
 		if (la_tmp == NULL) {
 			arp_mark_lle_reachable(la);
 			LLE_WUNLOCK(la);
 		} else {
 			/* Free newly-create entry and handle packet */
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 			la_tmp = NULL;
 			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 			/* arp_check_update_lle() returns @la unlocked */
 		}
 		la = NULL;
 	}
 reply:
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		struct llentry *lle = NULL;
 
 		sin.sin_addr = itaddr;
 		IF_AFDATA_RLOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			if (!V_arp_proxyall)
 				goto drop;
 
 			/* XXX MRT use table 0 for arp reply  */
 			if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
 			if (nh4.nh_ifp == ifp)
 				goto drop;
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
 
 			/* XXX MRT use table 0 for arp checks */
 			if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0)
 				goto drop;
 			if (nh4.nh_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
 				    " from %s via %s\n",
 				    inet_ntoa(isaddr), ifp->if_xname);
 				goto drop;
 			}
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa(itaddr));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
 
 	/*
 	 * arp_fillheader() may fail due to lack of support inside encap request
 	 * routing. This is not necessary an error, AF_ARP can/should be handled
 	 * by if_output().
 	 */
 	if (error != 0 && error != EAFNOSUPPORT) {
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		return;
 	}
 
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 /*
  * Checks received arp data against existing @la.
  * Updates lle state/performs notification if necessary.
  */
 static void
 arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
     int bridged, struct llentry *la)
 {
 	struct sockaddr sa;
 	struct mbuf *m_hold, *m_hold_next;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	LLE_WLOCK_ASSERT(la);
 
 	/* the following is not an error when doing bridging */
 	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 		if (log_arp_wrong_iface)
 			ARP_LOG(LOG_WARNING, "%s is on %s "
 			    "but got reply from %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    la->lle_tbl->llt_ifp->if_xname,
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		LLE_WUNLOCK(la);
 		return;
 	}
 	if ((la->la_flags & LLE_VALID) &&
 	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
 		if (la->la_flags & LLE_STATIC) {
 			LLE_WUNLOCK(la);
 			if (log_arp_permanent_modify)
 				ARP_LOG(LOG_ERR,
 				    "%*D attempts to modify "
 				    "permanent entry for %s on %s\n",
 				    ifp->if_addrlen,
 				    (u_char *)ar_sha(ah), ":",
 				    inet_ntoa(isaddr), ifp->if_xname);
 			return;
 		}
 		if (log_arp_movements) {
 			ARP_LOG(LOG_INFO, "%s moved from %*D "
 			    "to %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    ifp->if_addrlen,
 			    (u_char *)&la->ll_addr, ":",
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		}
 	}
 
 	/* Calculate full link prepend to use in lle */
 	linkhdrsize = sizeof(linkhdr);
 	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 	    &linkhdrsize, &lladdr_off) != 0)
 		return;
 
 	/* Check if something has changed */
 	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
 	    (la->la_flags & LLE_VALID) == 0) {
 		/* Try to perform LLE update */
 		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off) == 0)
 			return;
 
 		/* Clear fast path feedback request if set */
 		la->r_skip_req = 0;
 	}
 
 	arp_mark_lle_reachable(la);
 
 	/*
 	 * The packets are all freed within the call to the output
 	 * routine.
 	 *
 	 * NB: The lock MUST be released before the call to the
 	 * output routine.
 	 */
 	if (la->la_hold != NULL) {
 		m_hold = la->la_hold;
 		la->la_hold = NULL;
 		la->la_numheld = 0;
 		lltable_fill_sa_entry(la, &sa);
 		LLE_WUNLOCK(la);
 		for (; m_hold != NULL; m_hold = m_hold_next) {
 			m_hold_next = m_hold->m_nextpkt;
 			m_hold->m_nextpkt = NULL;
 			/* Avoid confusing lower layers. */
 			m_clrprotoflags(m_hold);
 			(*ifp->if_output)(ifp, m_hold, &sa, NULL);
 		}
 	} else
 		LLE_WUNLOCK(la);
 }
 
 static void
 arp_mark_lle_reachable(struct llentry *la)
 {
 	int canceled, wtime;
 
 	LLE_WLOCK_ASSERT(la);
 
 	la->ln_state = ARP_LLINFO_REACHABLE;
 	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 	if (!(la->la_flags & LLE_STATIC)) {
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime + V_arpt_keep;
 		wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
 		if (wtime < 0)
 			wtime = V_arpt_keep;
 		canceled = callout_reset(&la->lle_timer,
 		    hz * wtime, arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 	}
 	la->la_asked = 0;
 	la->la_preempt = V_arp_maxtries;
 }
 
 /*
  * Add pernament link-layer record for given interface address.
  */
 static __noinline void
 arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
 {
 	struct llentry *lle, *lle_tmp;
 
 	/*
 	 * Interface address LLE record is considered static
 	 * because kernel code relies on LLE_STATIC flag to check
 	 * if these entries can be rewriten by arp updates.
 	 */
 	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
 	if (lle == NULL) {
 		log(LOG_INFO, "arp_ifinit: cannot create arp "
 		    "entry for interface address\n");
 		return;
 	}
 
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 	/* Unlink any entry if exists */
 	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (lle_tmp != NULL)
 		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
 
 	lltable_link_entry(LLTABLE(ifp), lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	if (lle_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
 
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 	LLE_WUNLOCK(lle);
 	if (lle_tmp != NULL)
 		lltable_free_entry(LLTABLE(ifp), lle_tmp);
 }
 
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	const struct sockaddr_in *dst_in;
 	const struct sockaddr *dst;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	dst = ifa->ifa_addr;
 	dst_in = (const struct sockaddr_in *)dst;
 
 	if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
 		return;
 	arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
 
 	arp_add_ifa_lle(ifp, dst);
 }
 
 void
 arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
 {
 
 	if (ntohl(addr.s_addr) != INADDR_ANY)
 		arprequest(ifp, &addr, &addr, enaddr);
 }
 
 /*
  * Sends gratuitous ARPs for each ifaddr to notify other
  * nodes about the address change.
  */
 static __noinline void
 arp_handle_ifllchange(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 	}
 }
 
 /*
  * A handler for interface link layer address change event.
  */
 static void
 arp_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 
 	lltable_update_ifaddr(LLTABLE(ifp));
 
 	if ((ifp->if_flags & IFF_UP) != 0)
 		arp_handle_ifllchange(ifp);
 }
 
 static void
 vnet_arp_init(void)
 {
 
-	netisr_register(&arp_nh);
-	if (IS_DEFAULT_VNET(curvnet))
+	if (IS_DEFAULT_VNET(curvnet)) {
+		netisr_register(&arp_nh);
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
+	} else
+		netisr_register_vnet(&arp_nh);
 }
 VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
     vnet_arp_init, 0);
 
 #ifdef VIMAGE
 /*
  * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH
  * lookups after destroying the hash.  Ideally this would go on SI_ORDER_3.5.
  */
 static void
 vnet_arp_destroy(__unused void *arg)
 {
 
-	netisr_unregister(&arp_nh);
+	netisr_unregister_vnet(&arp_nh);
 }
 VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_arp_destroy, NULL);
 #endif
Index: projects/vnet/sys/netinet/igmp.c
===================================================================
--- projects/vnet/sys/netinet/igmp.c	(revision 299172)
+++ projects/vnet/sys/netinet/igmp.c	(revision 299173)
@@ -1,3676 +1,3678 @@
 /*-
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 /*
  * Internet Group Management Protocol (IGMP) routines.
  * [RFC1112, RFC2236, RFC3376]
  *
  * Written by Steve Deering, Stanford, May 1988.
  * Modified by Rosen Sharma, Stanford, Aug 1994.
  * Modified by Bill Fenner, Xerox PARC, Feb 1995.
  * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
  * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
  *
  * MULTICAST Revision: 3.5.1.4
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/condvar.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 static struct igmp_ifsoftc *
 		igi_alloc_locked(struct ifnet *);
 static void	igi_delete_locked(const struct ifnet *);
 static void	igmp_dispatch_queue(struct mbufq *, int, const int);
 static void	igmp_fasttimo_vnet(void);
 static void	igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_handle_state_change(struct in_multi *,
 		    struct igmp_ifsoftc *);
 static int	igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
 		    /*const*/ struct igmpv3 *);
 static int	igmp_input_v3_group_query(struct in_multi *,
 		    struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *);
 static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static void	igmp_intr(struct mbuf *);
 static int	igmp_isgroupreported(const struct in_addr);
 static struct mbuf *
 		igmp_ra_alloc(void);
 #ifdef KTR
 static char *	igmp_rec_type_to_str(const int);
 #endif
 static void	igmp_set_version(struct igmp_ifsoftc *, const int);
 static void	igmp_slowtimo_vnet(void);
 static int	igmp_v1v2_queue_report(struct in_multi *, const int);
 static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
 static void	igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *);
 static void	igmp_v2_update_group(struct in_multi *, const int);
 static void	igmp_v3_cancel_link_timers(struct igmp_ifsoftc *);
 static void	igmp_v3_dispatch_general_query(struct igmp_ifsoftc *);
 static struct mbuf *
 		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
 static int	igmp_v3_enqueue_group_record(struct mbufq *,
 		    struct in_multi *, const int, const int, const int);
 static int	igmp_v3_enqueue_filter_change(struct mbufq *,
 		    struct in_multi *);
 static void	igmp_v3_process_group_timers(struct igmp_ifsoftc *,
 		    struct mbufq *, struct mbufq *, struct in_multi *,
 		    const int);
 static int	igmp_v3_merge_state_changes(struct in_multi *,
 		    struct mbufq *);
 static void	igmp_v3_suppress_group_record(struct in_multi *);
 static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
 
 static const struct netisr_handler igmp_nh = {
 	.nh_name = "igmp",
 	.nh_handler = igmp_intr,
 	.nh_proto = NETISR_IGMP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * System-wide globals.
  *
  * Unlocked access to these is OK, except for the global IGMP output
  * queue. The IGMP subsystem lock ends up being system-wide for the moment,
  * because all VIMAGEs have to share a global output queue, as netisrs
  * themselves are not virtualized.
  *
  * Locking:
  *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * All output is delegated to the netisr.
  *    Now that Giant has been eliminated, the netisr may be inlined.
  *  * IN_MULTI_LOCK covers in_multi.
  *  * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file,
  *    including the output queue.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *  * igmp_ifsoftc is valid as long as PF_INET is attached to the interface,
  *    therefore it is not refcounted.
  *    We allow unlocked reads of igmp_ifsoftc when accessed via in_multi.
  *
  * Reference counting
  *  * IGMP acquires its own reference every time an in_multi is passed to
  *    it and the group is being joined for the first time.
  *  * IGMP releases its reference(s) on in_multi in a deferred way,
  *    because the operations which process the release run as part of
  *    a loop whose control variables are directly affected by the release
  *    (that, and not recursing on the IF_ADDR_LOCK).
  *
  * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
  * to a vnet in ifp->if_vnet.
  *
  * SMPng: XXX We may potentially race operations on ifma_protospec.
  * The problem is that we currently lack a clean way of taking the
  * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
  * as anything which modifies ifma needs to be covered by that lock.
  * So check for ifma_protospec being NULL before proceeding.
  */
 struct mtx		 igmp_mtx;
 
 struct mbuf		*m_raopt;		 /* Router Alert option */
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
 
 /*
  * VIMAGE-wide globals.
  *
  * The IGMPv3 timers themselves need to run per-image, however,
  * protosw timers run globally (see tcp).
  * An ifnet can only be in one vimage at a time, and the loopback
  * ifnet, loif, is itself virtualized.
  * It would otherwise be possible to seriously hose IGMP state,
  * and create inconsistencies in upstream multicast routing, if you have
  * multiple VIMAGEs running on the same link joining different multicast
  * groups, UNLESS the "primary IP address" is different. This is because
  * IGMP for IPv4 does not force link-local addresses to be used for each
  * node, unlike MLD for IPv6.
  * Obviously the IGMPv3 per-interface state has per-vimage granularity
  * also as a result.
  *
  * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
  * policy to control the address used by IGMP on the link.
  */
 static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
 							 * query response */
 static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
 							 * retransmit */
 static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
 							 * report; IGMPv3 g/sg
 							 * query response */
 
 #define	V_interface_timers_running	VNET(interface_timers_running)
 #define	V_state_change_timers_running	VNET(state_change_timers_running)
 #define	V_current_state_timers_running	VNET(current_state_timers_running)
 
 static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) =
     LIST_HEAD_INITIALIZER(igi_head);
 static VNET_DEFINE(struct igmpstat, igmpstat) = {
 	.igps_version = IGPS_VERSION_3,
 	.igps_len = sizeof(struct igmpstat),
 };
 static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
 
 #define	V_igi_head			VNET(igi_head)
 #define	V_igmpstat			VNET(igmpstat)
 #define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
 
 static VNET_DEFINE(int, igmp_recvifkludge) = 1;
 static VNET_DEFINE(int, igmp_sendra) = 1;
 static VNET_DEFINE(int, igmp_sendlocal) = 1;
 static VNET_DEFINE(int, igmp_v1enable) = 1;
 static VNET_DEFINE(int, igmp_v2enable) = 1;
 static VNET_DEFINE(int, igmp_legacysupp);
 static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
 
 #define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
 #define	V_igmp_sendra			VNET(igmp_sendra)
 #define	V_igmp_sendlocal		VNET(igmp_sendlocal)
 #define	V_igmp_v1enable			VNET(igmp_v1enable)
 #define	V_igmp_v2enable			VNET(igmp_v2enable)
 #define	V_igmp_legacysupp		VNET(igmp_legacysupp)
 #define	V_igmp_default_version		VNET(igmp_default_version)
 
 /*
  * Virtualized sysctls.
  */
 SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmpstat), igmpstat, "");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_recvifkludge), 0,
     "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendra), 0,
     "Send IP Router Alert option in IGMPv2/v3 messages");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendlocal), 0,
     "Send IGMP membership reports for 224.0.0.0/24 groups");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v1enable), 0,
     "Enable backwards compatibility with IGMPv1");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v2enable), 0,
     "Enable backwards compatibility with IGMPv2");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_legacysupp), 0,
     "Allow v1/v2 reports to suppress v3 group responses");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
     "Default version of IGMP to run on each interface");
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
     "Rate limit for IGMPv3 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
     "Per-interface IGMPv3 state");
 
 static __inline void
 igmp_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 static __inline void
 igmp_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 #ifdef KTR
 static __inline char *
 inet_ntoa_haddr(in_addr_t haddr)
 {
 	struct in_addr ia;
 
 	ia.s_addr = htonl(haddr);
 	return (inet_ntoa(ia));
 }
 #endif
 
 /*
  * Restore context from a queued IGMP output chain.
  * Return saved ifindex.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  */
 static __inline uint32_t
 igmp_restore_context(struct mbuf *m)
 {
 
 #ifdef notyet
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set default IGMP version.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
 {
 	int	 error;
 	int	 new;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	new = V_igmp_default_version;
 
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
 	     V_igmp_default_version, new);
 
 	V_igmp_default_version = new;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	i = V_igmp_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
 	     V_igmp_gsrdelay.tv_sec, i);
 	V_igmp_gsrdelay.tv_sec = i;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct igmp_ifsoftc to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
 	if (error)
 		return (error);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	if (name[0] <= 0 || name[0] > V_if_index) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	error = ENOENT;
 
 	ifp = ifnet_byindex(name[0]);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		if (ifp == igi->igi_ifp) {
 			struct igmp_ifinfo info;
 
 			info.igi_version = igi->igi_version;
 			info.igi_v1_timer = igi->igi_v1_timer;
 			info.igi_v2_timer = igi->igi_v2_timer;
 			info.igi_v3_timer = igi->igi_v3_timer;
 			info.igi_flags = igi->igi_flags;
 			info.igi_rv = igi->igi_rv;
 			info.igi_qi = igi->igi_qi;
 			info.igi_qri = igi->igi_qri;
 			info.igi_uri = igi->igi_uri;
 			error = SYSCTL_OUT(req, &info, sizeof(info));
 			break;
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains
  * using the netisr.
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop)
 {
 	struct mbuf *m;
 
 	while ((m = mbufq_dequeue(mq)) != NULL) {
 		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m);
 		if (loop)
 			m->m_flags |= M_IGMP_LOOP;
 		netisr_dispatch(NETISR_IGMP, m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing IGMP report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
  * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
  * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
  * this may break certain IGMP snooping switches which rely on the old
  * report behaviour.
  *
  * Return zero if the given group is one for which IGMP reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 igmp_isgroupreported(const struct in_addr addr)
 {
 
 	if (in_allhosts(addr) ||
 	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Construct a Router Alert option to use in outgoing packets.
  */
 static struct mbuf *
 igmp_ra_alloc(void)
 {
 	struct mbuf	*m;
 	struct ipoption	*p;
 
 	m = m_get(M_WAITOK, MT_DATA);
 	p = mtod(m, struct ipoption *);
 	p->ipopt_dst.s_addr = INADDR_ANY;
 	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
 	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
 	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
 	p->ipopt_list[3] = 0x00;	/* pad byte */
 	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
 
 	return (m);
 }
 
 /*
  * Attach IGMP when PF_INET is attached to an interface.
  */
 struct igmp_ifsoftc *
 igmp_domifattach(struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = igi_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		igi->igi_flags |= IGIF_SILENT;
 
 	IGMP_UNLOCK();
 
 	return (igi);
 }
 
 /*
  * VIMAGE: assume curvnet set by caller.
  */
 static struct igmp_ifsoftc *
 igi_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi;
 
 	IGMP_LOCK_ASSERT();
 
 	igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO);
 	if (igi == NULL)
 		goto out;
 
 	igi->igi_ifp = ifp;
 	igi->igi_version = V_igmp_default_version;
 	igi->igi_flags = 0;
 	igi->igi_rv = IGMP_RV_INIT;
 	igi->igi_qi = IGMP_QI_INIT;
 	igi->igi_qri = IGMP_QRI_INIT;
 	igi->igi_uri = IGMP_URI_INIT;
 	SLIST_INIT(&igi->igi_relinmhead);
 	mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
 
 	CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)",
 	     ifp, ifp->if_xname);
 
 out:
 	return (igi);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  *
  * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
  * XXX This is also bitten by unlocked ifma_protospec access.
  */
 void
 igmp_ifdetach(struct ifnet *ifp)
 {
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
 	    ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	if (igi->igi_version == IGMP_VERSION_3) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 #if 0
 			KASSERT(ifma->ifma_protospec != NULL,
 			    ("%s: ifma_protospec is NULL", __func__));
 #endif
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 			inm_clear_recorded(inm);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/*
 		 * Free the in_multi reference(s) for this IGMP lifecycle.
 		 */
 		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
 		    tinm) {
 			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 			inm_release_locked(inm);
 		}
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  */
 void
 igmp_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 	igi_delete_locked(ifp);
 	IGMP_UNLOCK();
 }
 
 static void
 igi_delete_locked(const struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 
 	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
 		if (igi->igi_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			mbufq_drain(&igi->igi_gq);
 
 			LIST_REMOVE(igi, igi_link);
 
 			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
 			    ("%s: there are dangling in_multi references",
 			    __func__));
 
 			free(igi, M_IGMP);
 			return;
 		}
 	}
 }
 
 /*
  * Process a received IGMPv1 query.
  * Return non-zero if the message should be dropped.
  *
  * VIMAGE: The curvnet pointer is derived from the input ifp.
  */
 static int
 igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 
 	/*
 	 * IGMPv1 Host Mmembership Queries SHOULD always be addressed to
 	 * 224.0.0.1. They are always treated as General Queries.
 	 * igmp_group is always ignored. Do not drop it as a userland
 	 * daemon may wish to see it.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
 		IGMPSTAT_INC(igps_rcv_badqueries);
 		return (0);
 	}
 	IGMPSTAT_INC(igps_rcv_gen_queries);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Switch to IGMPv1 host compatibility mode.
 	 */
 	igmp_set_version(igi, IGMP_VERSION_1);
 
 	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Start the timers in all of our group records
 	 * for the interface on which the query arrived,
 	 * except those which are already running.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_timer != 0)
 			continue;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(
 			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 			V_current_state_timers_running = 1;
 			break;
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 general or group-specific query.
  */
 static int
 igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 
 	is_general_query = 0;
 
 	/*
 	 * Validate address fields upfront.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (in_nullhost(igmp->igmp_group)) {
 		/*
 		 * IGMPv2 General Query.
 		 * If this was not sent to the all-hosts group, ignore it.
 		 */
 		if (!in_allhosts(ip->ip_dst))
 			return (0);
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		is_general_query = 1;
 	} else {
 		/* IGMPv2 Group-Specific Query. */
 		IGMPSTAT_INC(igps_rcv_group_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Ignore v2 query if in v1 Compatibility Mode.
 	 */
 	if (igi->igi_version == IGMP_VERSION_1)
 		goto out_locked;
 
 	igmp_set_version(igi, IGMP_VERSION_2);
 
 	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			igmp_v2_update_group(inm, timer);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	} else {
 		/*
 		 * Group-specific IGMPv2 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = inm_lookup(ifp, igmp->igmp_group);
 		if (inm != NULL) {
 			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			igmp_v2_update_group(inm, timer);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an IGMPv2 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group, and timer is
  * below the threshold, reset it.
  *
  * We may be updating the group for the first time since we switched
  * to IGMPv3. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses. 
  *
  * Unlike IGMPv3, the delay per group should be jittered
  * to avoid bursts of IGMPv2 reports.
  */
 static void
 igmp_v2_update_group(struct in_multi *inm, const int timer)
 {
 
 	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
 	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
 
 	IN_MULTI_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (inm->inm_timer != 0 &&
 		    inm->inm_timer <= timer) {
 			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		break;
 	case IGMP_SLEEPING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
 		inm->inm_state = IGMP_AWAKENING_MEMBER;
 		break;
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received IGMPv3 general, group-specific or
  * group-and-source-specific query.
  * Assumes m has already been pulled up to the full IGMP message length.
  * Return 0 if successful, otherwise an appropriate error code is returned.
  */
 static int
 igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
     /*const*/ struct igmpv3 *igmpv3)
 {
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint32_t		 maxresp, nsrc, qqi;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 
 	is_general_query = 0;
 
 	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
 	if (maxresp >= 128) {
 		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
 			  (IGMP_EXP(igmpv3->igmp_code) + 3);
 	}
 
 	/*
 	 * Robustness must never be less than 2 for on-wire IGMPv3.
 	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
 	 * an exception for interfaces whose IGMPv3 state changes
 	 * are redirected to loopback (e.g. MANET).
 	 */
 	qrv = IGMP_QRV(igmpv3->igmp_misc);
 	if (qrv < 2) {
 		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
 		    qrv, IGMP_RV_INIT);
 		qrv = IGMP_RV_INIT;
 	}
 
 	qqi = igmpv3->igmp_qqi;
 	if (qqi >= 128) {
 		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
 		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
 	}
 
 	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Validate address fields and versions upfront before
 	 * accepting v3 query.
 	 * XXX SMPng: Unlocked access to igmpstat counters here.
 	 */
 	if (in_nullhost(igmpv3->igmp_group)) {
 		/*
 		 * IGMPv3 General Query.
 		 *
 		 * General Queries SHOULD be directed to 224.0.0.1.
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
 			IGMPSTAT_INC(igps_rcv_badqueries);
 			return (0);
 		}
 		is_general_query = 1;
 	} else {
 		/* Group or group-source specific query. */
 		if (nsrc == 0)
 			IGMPSTAT_INC(igps_rcv_group_queries);
 		else
 			IGMPSTAT_INC(igps_rcv_gsr_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Discard the v3 query if we're in Compatibility Mode.
 	 * The RFC is not obviously worded that hosts need to stay in
 	 * compatibility mode until the Old Version Querier Present
 	 * timer expires.
 	 */
 	if (igi->igi_version != IGMP_VERSION_3) {
 		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
 		    igi->igi_version, ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	igmp_set_version(igi, IGMP_VERSION_3);
 	igi->igi_rv = qrv;
 	igi->igi_qi = qqi;
 	igi->igi_qri = maxresp;
 
 	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
 	    maxresp);
 
 	if (is_general_query) {
 		/*
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
 			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
 			V_interface_timers_running = 1;
 		}
 	} else {
 		/*
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = inm_lookup(ifp, igmpv3->igmp_group);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->inm_lastgsrtv,
 			    &V_igmp_gsrdelay)) {
 				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
 				    __func__);
 				IGMPSTAT_INC(igps_drop_gsr_queries);
 				goto out_locked;
 			}
 		}
 		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
 		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
 			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a recieved IGMPv3 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occured. Currently this is ignored.
  */
 static int
 igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi,
     int timer, /*const*/ struct igmpv3 *igmpv3)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 			inm_clear_recorded(inm);
 			timer = min(inm->inm_timer, timer);
 		}
 		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->inm_timer, timer);
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 * FIXME: Handling source lists larger than 1 mbuf requires that
 	 * we pass the mbuf chain pointer down to this function, and use
 	 * m_getptr() to walk the chain.
 	 */
 	if (inm->inm_nsrc > 0) {
 		const struct in_addr	*ap;
 		int			 i, nrecorded;
 
 		ap = (const struct in_addr *)(igmpv3 + 1);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++, ap++) {
 			retval = inm_record_source(inm, ap->s_addr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_IGMPV3,
 			    "%s: schedule response to SG query", __func__);
 			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 			V_current_state_timers_running = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received IGMPv1 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia != NULL) {
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, stop our group timer and transition to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifsoftc *igi;
 
 		igi = inm->inm_igi;
 		if (igi == NULL) {
 			KASSERT(igi != NULL,
 			    ("%s: no igi for ifp %p", __func__, ifp));
 			goto out_locked;
 		}
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_SLEEPING_MEMBER:
 			inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_REPORTING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			if (igi->igi_version == IGMP_VERSION_1)
 				inm->inm_state = IGMP_LAZY_MEMBER;
 			else if (igi->igi_version == IGMP_VERSION_2)
 				inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	/*
 	 * Make sure we don't hear our own membership report.  Fast
 	 * leave requires knowing that we are the only member of a
 	 * group.
 	 */
 	IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
 		ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		if (ia != NULL)
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv2 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifsoftc *igi;
 
 		igi = inm->inm_igi;
 		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_LAZY_MEMBER:
 			inm->inm_state = IGMP_LAZY_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 int
 igmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	int iphlen;
 	struct ifnet *ifp;
 	struct igmp *igmp;
 	struct ip *ip;
 	struct mbuf *m;
 	int igmplen;
 	int minlen;
 	int queryver;
 
 	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
 
 	m = *mp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 
 	IGMPSTAT_INC(igps_rcv_total);
 
 	ip = mtod(m, struct ip *);
 	iphlen = *offp;
 	igmplen = ntohs(ip->ip_len) - iphlen;
 
 	/*
 	 * Validate lengths.
 	 */
 	if (igmplen < IGMP_MINLEN) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Always pullup to the minimum size for v1/v2 or v3
 	 * to amortize calls to m_pullup().
 	 */
 	minlen = iphlen;
 	if (igmplen >= IGMP_V3_QUERY_MINLEN)
 		minlen += IGMP_V3_QUERY_MINLEN;
 	else
 		minlen += IGMP_MINLEN;
 	if ((!M_WRITABLE(m) || m->m_len < minlen) &&
 	    (m = m_pullup(m, minlen)) == 0) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Validate checksum.
 	 */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	igmp = mtod(m, struct igmp *);
 	if (in_cksum(m, igmplen)) {
 		IGMPSTAT_INC(igps_rcv_badsum);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	/*
 	 * IGMP control traffic is link-scope, and must have a TTL of 1.
 	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
 	 * probe packets may come from beyond the LAN.
 	 */
 	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
 		IGMPSTAT_INC(igps_rcv_badttl);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	switch (igmp->igmp_type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
 		if (igmplen == IGMP_MINLEN) {
 			if (igmp->igmp_code == 0)
 				queryver = IGMP_VERSION_1;
 			else
 				queryver = IGMP_VERSION_2;
 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
 			queryver = IGMP_VERSION_3;
 		} else {
 			IGMPSTAT_INC(igps_rcv_tooshort);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		switch (queryver) {
 		case IGMP_VERSION_1:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v1enable)
 				break;
 			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_2:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v2enable)
 				break;
 			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_3: {
 				struct igmpv3 *igmpv3;
 				uint16_t igmpv3len;
 				uint16_t nsrc;
 
 				IGMPSTAT_INC(igps_rcv_v3_queries);
 				igmpv3 = (struct igmpv3 *)igmp;
 				/*
 				 * Validate length based on source count.
 				 */
 				nsrc = ntohs(igmpv3->igmp_numsrc);
 				if (nsrc * sizeof(in_addr_t) >
 				    UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				/*
 				 * m_pullup() may modify m, so pullup in
 				 * this scope.
 				 */
 				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
 				   sizeof(struct in_addr) * nsrc;
 				if ((!M_WRITABLE(m) ||
 				     m->m_len < igmpv3len) &&
 				    (m = m_pullup(m, igmpv3len)) == NULL) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
 				    + iphlen);
 				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 			}
 			break;
 		}
 		break;
 
 	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v1enable)
 			break;
 		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v2enable)
 			break;
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
 		/*
 		 * Hosts do not need to process IGMPv3 membership reports,
 		 * as report suppression is no longer required.
 		 */
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Pass all valid IGMP packets up to any process(es) listening on a
 	 * raw IGMP socket.
 	 */
 	*mp = m;
 	return (rip_input(mp, offp, proto));
 }
 
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  * Sends are shuffled off to a netisr to deal with Giant.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 igmp_fasttimo_vnet(void)
 {
 	struct mbufq		 scq;	/* State-change packets */
 	struct mbufq		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	int			 loop, uri_fasthz;
 
 	loop = 0;
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running &&
 	    !V_interface_timers_running &&
 	    !V_state_change_timers_running)
 		return;
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	/*
 	 * IGMPv3 General Query response timer processing.
 	 */
 	if (V_interface_timers_running) {
 		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
 
 		V_interface_timers_running = 0;
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_v3_timer == 0) {
 				/* Do nothing. */
 			} else if (--igi->igi_v3_timer == 0) {
 				igmp_v3_dispatch_general_query(igi);
 			} else {
 				V_interface_timers_running = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running &&
 	    !V_state_change_timers_running)
 		goto out_locked;
 
 	V_current_state_timers_running = 0;
 	V_state_change_timers_running = 0;
 
 	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
 
 	/*
 	 * IGMPv1/v2/v3 host report and state-change timer processing.
 	 * Note: Processing a v3 group timer may remove a node.
 	 */
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		ifp = igi->igi_ifp;
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
 			    PR_FASTHZ);
 			mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS);
 			mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			switch (igi->igi_version) {
 			case IGMP_VERSION_1:
 			case IGMP_VERSION_2:
 				igmp_v1v2_process_group_timer(inm,
 				    igi->igi_version);
 				break;
 			case IGMP_VERSION_3:
 				igmp_v3_process_group_timers(igi, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			struct in_multi		*tinm;
 
 			igmp_dispatch_queue(&qrq, 0, loop);
 			igmp_dispatch_queue(&scq, 0, loop);
 
 			/*
 			 * Free the in_multi reference(s) for this
 			 * IGMP lifecycle.
 			 */
 			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
 			    inm_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
 				    inm_nrele);
 				inm_release_locked(inm);
 			}
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Update host report group timer for IGMPv1/v2.
  * Will update the global pending timer flags.
  */
 static void
 igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
 {
 	int report_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	if (inm->inm_timer == 0) {
 		report_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		report_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 		return;
 	}
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (report_timer_expired) {
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			(void)igmp_v1v2_queue_report(inm,
 			    (version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 		}
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Update a group's timers for IGMPv3.
  * Will update the global pending timer flags.
  * Note: Unlocked read from igi.
  */
 static void
 igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
     struct mbufq *qrq, struct mbufq *scq,
     struct in_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from v1/v2 compatibility mode back to v3,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->inm_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 	}
 
 	if (inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			/* XXX Clear recorded sources for next time. */
 			inm_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->inm_scrv > 0) {
 				inm->inm_sctimer = uri_fasthz;
 				V_state_change_timers_running = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)igmp_v3_merge_state_changes(inm, scq);
 
 			inm_commit(inm);
 			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release IGMP's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
 			    inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 		}
 		break;
 	}
 }
 
 
 /*
  * Suppress a group's pending response to a group or source/group query.
  *
  * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
  * Do NOT update ST1/ST0 as this operation merely suppresses
  * the currently pending group record.
  * Do NOT suppress the response to a general query. It is possible but
  * it would require adding another state or flag.
  */
 static void
 igmp_v3_suppress_group_record(struct in_multi *inm)
 {
 
 	IN_MULTI_LOCK_ASSERT();
 
 	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
 		("%s: not IGMPv3 mode on link", __func__));
 
 	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER ||
 	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
 		return;
 
 	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 		inm_clear_recorded(inm);
 
 	inm->inm_timer = 0;
 	inm->inm_state = IGMP_REPORTING_MEMBER;
 }
 
 /*
  * Switch to a different IGMP version on the given interface,
  * as per Section 7.2.1.
  */
 static void
 igmp_set_version(struct igmp_ifsoftc *igi, const int version)
 {
 	int old_version_timer;
 
 	IGMP_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
 		/*
 		 * Compute the "Older Version Querier Present" timer as per
 		 * Section 8.12.
 		 */
 		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
 		old_version_timer *= PR_SLOWHZ;
 
 		if (version == IGMP_VERSION_1) {
 			igi->igi_v1_timer = old_version_timer;
 			igi->igi_v2_timer = 0;
 		} else if (version == IGMP_VERSION_2) {
 			igi->igi_v1_timer = 0;
 			igi->igi_v2_timer = old_version_timer;
 		}
 	}
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_2) {
 			igi->igi_version = IGMP_VERSION_2;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_1) {
 			igi->igi_version = IGMP_VERSION_1;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	}
 }
 
 /*
  * Cancel pending IGMPv3 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Only ever called on a transition from v3 to Compatibility mode. Kill
  * the timers stone dead (this may be expensive for large N groups), they
  * will be restarted if Compatibility Mode deems that they must be due to
  * query processing.
  */
 static void
 igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
 	    igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * Stop the v3 General Query Response on this link stone dead.
 	 * If fasttimo is woken up due to V_interface_timers_running,
 	 * the flag will be cleared if there are no pending link timers.
 	 */
 	igi->igi_v3_timer = 0;
 
 	/*
 	 * Now clear the current-state and state-change report timers
 	 * for all memberships scoped to this link.
 	 */
 	ifp = igi->igi_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * These states are either not relevant in v3 mode,
 			 * or are unreported. Do nothing.
 			 */
 			break;
 		case IGMP_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching to
 			 * compatibility mode, we need to release the final
 			 * reference held for issuing the INCLUDE {}, and
 			 * transition to REPORTING to ensure the host leave
 			 * message is sent upstream to the old querier --
 			 * transition to NOT would lose the leave and race.
 			 */
 			SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
 			/* FALLTHROUGH */
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 			inm_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case IGMP_REPORTING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			break;
 		}
 		/*
 		 * Always clear state-change and group report timers.
 		 * Free any pending IGMPv3 state-change records.
 		 */
 		inm->inm_sctimer = 0;
 		inm->inm_timer = 0;
 		mbufq_drain(&inm->inm_scq);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 		inm_release_locked(inm);
 	}
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 7.2.1 of RFC 3376.
  */
 static void
 igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi)
 {
 
 	IGMP_LOCK_ASSERT();
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
 		/*
 		 * IGMPv1 and IGMPv2 Querier Present timers expired.
 		 *
 		 * Revert to IGMPv3.
 		 */
 		if (igi->igi_version != IGMP_VERSION_3) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_version = IGMP_VERSION_3;
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer expired,
 		 * IGMPv2 Querier Present timer running.
 		 * If IGMPv2 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv2 is enabled, revert to IGMPv2.
 		 */
 		if (!V_igmp_v2enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v2_timer;
 			if (igi->igi_version != IGMP_VERSION_2) {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_2,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_version = IGMP_VERSION_2;
 				igmp_v3_cancel_link_timers(igi);
 			}
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer running.
 		 * Stop IGMPv2 timer if running.
 		 *
 		 * If IGMPv1 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv1 is enabled, reset IGMPv2 timer if running.
 		 */
 		if (!V_igmp_v1enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v1_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v1_timer;
 		}
 		if (igi->igi_v2_timer > 0) {
 			CTR3(KTR_IGMPV3,
 			    "%s: cancel v2 timer on %p(%s)",
 			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 		}
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 igmp_slowtimo_vnet(void)
 {
 	struct igmp_ifsoftc *igi;
 
 	IGMP_LOCK();
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		igmp_v1v2_process_querier_timers(igi);
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Dispatch an IGMPv1/v2 host report or leave message.
  * These are always small enough to fit inside a single mbuf.
  */
 static int
 igmp_v1v2_queue_report(struct in_multi *inm, const int type)
 {
 	struct ifnet		*ifp;
 	struct igmp		*igmp;
 	struct ip		*ip;
 	struct mbuf		*m;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	ifp = inm->inm_ifp;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOMEM);
 	M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
 
 	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len = sizeof(struct igmp);
 
 	igmp = mtod(m, struct igmp *);
 	igmp->igmp_type = type;
 	igmp->igmp_code = 0;
 	igmp->igmp_group = inm->inm_addr;
 	igmp->igmp_cksum = 0;
 	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
 	ip->ip_off = 0;
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
 	else
 		ip->ip_dst = inm->inm_addr;
 
 	igmp_save_context(m, ifp);
 
 	m->m_flags |= M_IGMPV2;
 	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
 		m->m_flags |= M_IGMP_LOOP;
 
 	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
 	netisr_dispatch(NETISR_IGMP, m);
 
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv4 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to.the group
  * state, it is now up to IGMP to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the IGMPv3 state machine at group level. The IGMP module
  * however makes the decision as to which IGMP protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
  * save ourselves a bunch of work; any exclusive mode groups need not
  * compute source filter lists.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 igmp_change_state(struct in_multi *inm)
 {
 	struct igmp_ifsoftc *igi;
 	struct ifnet *ifp;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to detect if the upper layer just asked us to change state
 	 * for an interface which has now gone away.
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	/*
 	 * Sanity check that netinet's notion of ifp is the
 	 * same as net's.
 	 */
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	/*
 	 * If we detect a state transition to or from MCAST_UNDEFINED
 	 * for this group, then we are starting or finishing an IGMP
 	 * life cycle for this group.
 	 */
 	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
 		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
 		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
 		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
 			error = igmp_initial_join(inm, igi);
 			goto out_locked;
 		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
 			igmp_final_leave(inm, igi);
 			goto out_locked;
 		}
 	} else {
 		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
 	}
 
 	error = igmp_handle_state_change(inm, igi);
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an IGMP group.
  *
  * When joining a group:
  *  If the group should have its IGMP traffic suppressed, do nothing.
  *  IGMPv1 starts sending IGMPv1 host membership reports.
  *  IGMPv2 starts sending IGMPv2 host membership reports.
  *  IGMPv3 will schedule an IGMPv3 state-change report containing the
  *  initial state of the membership.
  */
 static int
 igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	struct mbufq		*mq;
 	int			 error, retval, syncstates;
 
 	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
 	 * are never reported in any IGMP protocol exchanges.
 	 * All other groups enter the appropriate IGMP state machine
 	 * for the version in use on this link.
 	 * A link marked as IGIF_SILENT causes IGMP to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		inm->inm_state = IGMP_SILENT_MEMBER;
 		inm->inm_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3 &&
 		    inm->inm_state == IGMP_LEAVING_MEMBER)
 			inm_release_locked(inm);
 
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 
 		switch (igi->igi_version) {
 		case IGMP_VERSION_1:
 		case IGMP_VERSION_2:
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			error = igmp_v1v2_queue_report(inm,
 			    (igi->igi_version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 			if (error == 0) {
 				inm->inm_timer = IGMP_RANDOM_DELAY(
 				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 				V_current_state_timers_running = 1;
 			}
 			break;
 
 		case IGMP_VERSION_3:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			mq = &inm->inm_scq;
 			mbufq_drain(mq);
 			retval = igmp_v3_enqueue_group_record(mq, inm, 1,
 			    0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next igmp_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 */
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				KASSERT(igi->igi_rv > 1,
 				   ("%s: invalid robustness %d", __func__,
 				    igi->igi_rv));
 				inm->inm_scrv = igi->igi_rv;
 			}
 			inm->inm_sctimer = 1;
 			V_state_change_timers_running = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the IGMP life-cycle.
  */
 static int
 igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	int			 retval;
 
 	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr) ||
 	    (igi->igi_version != IGMP_VERSION_3)) {
 		if (!igmp_isgroupreported(inm->inm_addr)) {
 			CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	mbufq_drain(&inm->inm_scq);
 
 	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
 	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
 	if (retval <= 0)
 		return (-retval);
 
 	/*
 	 * If record(s) were enqueued, start the state-change
 	 * report timer for this group.
 	 */
 	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
 	inm->inm_sctimer = 1;
 	V_state_change_timers_running = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for an IGMP group.
  *
  * When leaving a group:
  *  IGMPv1 does nothing.
  *  IGMPv2 sends a host leave message, if and only if we are the reporter.
  *  IGMPv3 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	int syncstates;
 
 	syncstates = 1;
 
 	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		if (igi->igi_version == IGMP_VERSION_2) {
 #ifdef INVARIANTS
 			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 			panic("%s: IGMPv3 state reached, not IGMPv3 mode",
 			     __func__);
 #endif
 			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
 			inm->inm_state = IGMP_NOT_MEMBER;
 		} else if (igi->igi_version == IGMP_VERSION_3) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			mbufq_drain(&inm->inm_scq);
 			inm->inm_timer = 0;
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				inm->inm_scrv = igi->igi_rv;
 			}
 			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    inet_ntoa(inm->inm_addr),
 			    inm->inm_ifp->if_xname, inm->inm_scrv);
 			if (inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				inm->inm_sctimer = 0;
 			} else {
 				int retval;
 
 				inm_acquire_locked(inm);
 
 				retval = igmp_v3_enqueue_group_record(
 				    &inm->inm_scq, inm, 1, 0, 0);
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->inm_state = IGMP_LEAVING_MEMBER;
 				inm->inm_sctimer = 1;
 				V_state_change_timers_running = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 }
 
 /*
  * Enqueue an IGMPv3 group record to the given output queue.
  *
  * XXX This function could do with having the allocation code
  * split out, and the multiple-tree-walks coalesced into a single
  * routine as has been done in igmp_v3_enqueue_filter_change().
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IP/IGMP header to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query)
 {
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ifnet		*ifp;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 error, is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	in_addr_t		 naddr;
 	uint8_t			 mode;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 	ifp = inm->inm_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pig = NULL;
 	type = IGMP_DO_NOTHING;
 	mode = inm->inm_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
 	    inm->inm_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 */
 		if (mode != inm->inm_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_INCLUDE_MODE;
 				if (mode == MCAST_UNDEFINED)
 					record_has_sources = 0;
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = IGMP_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = IGMP_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = IGMP_MODE_IS_INCLUDE;
 			KASSERT(inm->inm_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->inm_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (igmp_v3_enqueue_filter_change(mq, inm));
 
 	if (type == IGMP_DO_NOTHING) {
 		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr),
 		    inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct igmp_grouprec);
 	if (record_has_sources)
 		minrec0len += sizeof(in_addr_t);
 
 	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
 	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
 	    inm->inm_ifp->if_xname);
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = mbufq_last(mq);
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		m = m0;
 		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
 	} else {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		if (!is_state_change && !is_group_query) {
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (m)
 				m->m_data += IGMP_LEADINGSPACE;
 		}
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 
 		igmp_save_context(m, ifp);
 
 		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	ig.ig_type = type;
 	ig.ig_datalen = 0;
 	ig.ig_numsrc = 0;
 	ig.ig_group = inm->inm_addr;
 	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct igmp_grouprec);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, do not
 	 * include source entries.
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			md = m_last(m);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			nbytes += sizeof(in_addr_t);
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		mbufq_enqueue(mq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m)
 			m->m_data += IGMP_LEADINGSPACE;
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 		igmp_save_context(m, ifp);
 		md = m_getptr(m, 0, &off);
 		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct igmp_grouprec);
 
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 
 		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
 		mbufq_enqueue(mq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
 	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
 	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
 	REC_FULL = REC_ALLOW | REC_BLOCK
 } rectype_t;
 
 /*
  * Enqueue an IGMPv3 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm)
 {
 	static const int MINRECLEN =
 	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
 	struct ifnet		*ifp;
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	in_addr_t		 naddr;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	if (inm->inm_nsrc == 0 ||
 	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->inm_ifp;			/* interface */
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	npbytes = 0;	/* # of bytes appended this packet */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			m0 = mbufq_last(mq);
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     IGMP_V3_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				CTR1(KTR_IGMPV3,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m)
 					m->m_data += IGMP_LEADINGSPACE;
 				if (m == NULL) {
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 					if (m)
 						M_ALIGN(m, IGMP_LEADINGSPACE);
 				}
 				if (m == NULL) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				igmp_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 				    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				npbytes = 0;
 				CTR1(KTR_IGMPV3,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the IGMP group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&ig, 0, sizeof(ig));
 			ig.ig_group = inm->inm_addr;
 			if (!m_append(m, sizeof(ig), (void *)&ig)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct igmp_grouprec);
 			if (m != m0) {
 				/* new packet; offset in c hain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct igmp_grouprec), &off);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct igmp_grouprec));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL)
 				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
 			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 				CTR2(KTR_IGMPV3, "%s: visit node %s",
 				    __func__, inet_ntoa_haddr(ims->ims_haddr));
 				now = ims_get_mode(inm, ims, 1);
 				then = ims_get_mode(inm, ims, 0);
 				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				naddr = htonl(ims->ims_haddr);
 				if (!m_append(m, sizeof(in_addr_t),
 				    (void *)&naddr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_IGMPV3,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct igmp_grouprec);
 				if (m != m0) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_adj(m, -ig)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct igmp_grouprec)));
 				}
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(in_addr_t));
 			if (crt == REC_ALLOW)
 				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
 			pig->ig_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				mbufq_enqueue(mq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 static int
 igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq)
 {
 	struct mbufq	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->inm_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->inm_scq;
 #ifdef KTR
 	if (mbufq_first(gq) == NULL) {
 		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = mbufq_first(gq);
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an IGMPv3 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatentation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = mbufq_last(scq);
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    IGMP_V3_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
 				domerge = 1;
 		}
 
 		if (!domerge && mbufq_full(gq)) {
 			CTR2(KTR_IGMPV3,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
 			m0 = mbufq_dequeue(gq);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)",
 			    __func__, m0, scq);
 			mbufq_enqueue(scq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending IGMPv3 General Query.
  */
 static void
 igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	int			 retval, loop;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi->igi_version == IGMP_VERSION_3,
 	    ("%s: called when version %d", __func__, igi->igi_version));
 
 	/*
 	 * Check that there are some packets queued. If so, send them first.
 	 * For large number of groups the reply to general query can take
 	 * many packets, we should finish sending them before starting of
 	 * queuing the new reply.
 	 */
 	if (mbufq_len(&igi->igi_gq) != 0)
 		goto send;
 
 	ifp = igi->igi_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->inm_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
 			    inm, 0, 0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 send:
 	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (mbufq_first(&igi->igi_gq) != NULL) {
 		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
 		    IGMP_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running = 1;
 	}
 }
 
 /*
  * Transmit the next pending IGMP message in the output queue.
  *
  * We get called from netisr_processqueue(). A mutex private to igmpoq
  * will be acquired and released around this routine.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as IGMP traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 igmp_intr(struct mbuf *m)
 {
 	struct ip_moptions	 imo;
 	struct ifnet		*ifp;
 	struct mbuf		*ipopts, *m0;
 	int			 error;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
 	ifindex = igmp_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IPSTAT_INC(ips_noroute);
 		goto out;
 	}
 
 	ipopts = V_igmp_sendra ? m_raopt : NULL;
 
 	imo.imo_multicast_ttl  = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
 
 	/*
 	 * If the user requested that IGMP traffic be explicitly
 	 * redirected to the loopback interface (e.g. they are running a
 	 * MANET interface and the routing protocol needs to see the
 	 * updates), handle this now.
 	 */
 	if (m->m_flags & M_IGMP_LOOP)
 		imo.imo_multicast_ifp = V_loif;
 	else
 		imo.imo_multicast_ifp = ifp;
 
 	if (m->m_flags & M_IGMPV2) {
 		m0 = m;
 	} else {
 		m0 = igmp_v3_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
 			m_freem(m);
 			IPSTAT_INC(ips_odropped);
 			goto out;
 		}
 	}
 
 	igmp_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 #ifdef MAC
 	mac_netinet_igmp_send(ifp, m0);
 #endif
 	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
 	if (error) {
 		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 
 	IGMPSTAT_INC(igps_snd_reports);
 
 out:
 	/*
 	 * We must restore the existing vnet pointer before
 	 * continuing as we are run from netisr context.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Encapsulate an IGMPv3 report.
  *
  * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
  * chain has already had its IP/IGMPv3 header prepended. In this case
  * the function will not attempt to prepend; the lengths and checksums
  * will however be re-computed.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct rm_priotracker	in_ifa_tracker;
 	struct igmp_report	*igmp;
 	struct ip		*ip;
 	int			 hdrlen, igmpreclen;
 
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	igmpreclen = m_length(m, NULL);
 	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
 
 	if (m->m_flags & M_IGMPV3_HDR) {
 		igmpreclen -= hdrlen;
 	} else {
 		M_PREPEND(m, hdrlen, M_NOWAIT);
 		if (m == NULL)
 			return (NULL);
 		m->m_flags |= M_IGMPV3_HDR;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len -= sizeof(struct ip);
 
 	igmp = mtod(m, struct igmp_report *);
 	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
 	igmp->ir_rsv1 = 0;
 	igmp->ir_rsv2 = 0;
 	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	igmp->ir_cksum = 0;
 	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
 	ip->ip_len = htons(hdrlen + igmpreclen);
 	ip->ip_off = htons(IP_DF);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_sum = 0;
 
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (m->m_flags & M_IGMP_LOOP) {
 		struct in_ifaddr *ia;
 
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia != NULL) {
 			ip->ip_src = ia->ia_addr.sin_addr;
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
 
 	return (m);
 }
 
 #ifdef KTR
 static char *
 igmp_rec_type_to_str(const int type)
 {
 
 	switch (type) {
 		case IGMP_CHANGE_TO_EXCLUDE_MODE:
 			return "TO_EX";
 			break;
 		case IGMP_CHANGE_TO_INCLUDE_MODE:
 			return "TO_IN";
 			break;
 		case IGMP_MODE_IS_EXCLUDE:
 			return "MODE_EX";
 			break;
 		case IGMP_MODE_IS_INCLUDE:
 			return "MODE_IN";
 			break;
 		case IGMP_ALLOW_NEW_SOURCES:
 			return "ALLOW_NEW";
 			break;
 		case IGMP_BLOCK_OLD_SOURCES:
 			return "BLOCK_OLD";
 			break;
 		default:
 			break;
 	}
 	return "unknown";
 }
 #endif
 
+#ifdef VIMAGE
 static void
 vnet_igmp_init(const void *unused __unused)
 {
 
-	netisr_register(&igmp_nh);
+	netisr_register_vnet(&igmp_nh);
 }
 VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY,
     vnet_igmp_init, NULL);
 
-#ifdef VIMAGE
 static void
 vnet_igmp_uninit(const void *unused __unused)
 {
 
 	/* This can happen when we shutdown the entire network stack. */
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
-	netisr_unregister(&igmp_nh);
+	netisr_unregister_vnet(&igmp_nh);
 }
 VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY,
     vnet_igmp_uninit, NULL);
 #endif
 
 #ifdef DDB
 DB_SHOW_COMMAND(igi_list, db_show_igi_list)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 	LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head;
 
         if (!have_addr) {
                 db_printf("usage: show igi_list <addr>\n");
                 return;
         }
         igi_head = (struct _igi_list *)addr;
 
 	LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) {
 		db_printf("igmp_ifsoftc %p:\n", igi);
 		db_printf("    ifp %p\n", igi->igi_ifp);
 		db_printf("    version %u\n", igi->igi_version);
 		db_printf("    v1_timer %u\n", igi->igi_v1_timer);
 		db_printf("    v2_timer %u\n", igi->igi_v2_timer);
 		db_printf("    v3_timer %u\n", igi->igi_v3_timer);
 		db_printf("    flags %#x\n", igi->igi_flags);
 		db_printf("    rv %u\n", igi->igi_rv);
 		db_printf("    qi %u\n", igi->igi_qi);
 		db_printf("    qri %u\n", igi->igi_qri);
 		db_printf("    uri %u\n", igi->igi_uri);
 		/* SLIST_HEAD(,in_multi)   igi_relinmhead */
 		/* struct mbufq    igi_gq; */
 		db_printf("\n");
 	}
 }
 #endif
 
 static int
 igmp_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 		IGMP_LOCK_INIT();
 		m_raopt = igmp_ra_alloc();
+		netisr_register(&igmp_nh);
 		break;
 	case MOD_UNLOAD:
 		CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
+		netisr_unregister(&igmp_nh);
 		m_free(m_raopt);
 		m_raopt = NULL;
 		IGMP_LOCK_DESTROY();
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t igmp_mod = {
     "igmp",
     igmp_modevent,
     0
 };
 DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
Index: projects/vnet/sys/netinet/ip_input.c
===================================================================
--- projects/vnet/sys/netinet/ip_input.c	(revision 299172)
+++ projects/vnet/sys/netinet/ip_input.c	(revision 299173)
@@ -1,1363 +1,1367 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 #include <netinet/in_rss.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 /* IP reassembly functions are defined in ip_reass.c. */
 extern void ipreass_init(void);
 extern void ipreass_drain(void);
 extern void ipreass_slowtimo(void);
 #ifdef VIMAGE
 extern void ipreass_destroy(void);
 #endif
 
 struct rmlock in_ifaddr_lock;
 RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 static VNET_DEFINE(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * It should likely have something that assert it
  * actually has valid flow details.
  */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
     "Maximum size of the IP direct input queue");
 
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_direct_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_direct_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
  */
 void
 ip_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	TAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	ipreass_init();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	/* Skip initialization of globals for non-default instances. */
-	if (!IS_DEFAULT_VNET(curvnet))
-		goto out;
+	if (!IS_DEFAULT_VNET(curvnet)) {
+		netisr_register_vnet(&ip_nh);
+#ifdef	RSS
+		netisr_register_vnet(&ip_direct_nh);
+#endif
+		return;
+	}
 
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip_protox[i] = pr - inetsw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip_protox[pr->pr_protocol] = pr - inetsw;
 		}
 
-out:
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 
 #ifdef VIMAGE
 static void
 ip_destroy(void *unused __unused)
 {
 	struct ifnet *ifp;
 	int error;
 
 #ifdef	RSS
-	netisr_unregister(&ip_direct_nh);
+	netisr_unregister_vnet(&ip_direct_nh);
 #endif
-	netisr_unregister(&ip_nh);
+	netisr_unregister_vnet(&ip_nh);
 
 	if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Remove the IPv4 addresses from all interfaces. */
 	in_ifscrub_all();
 
 	/* Make sure the IPv4 routes are gone as well. */
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		rt_flushifroutes_af(ifp, AF_INET);
 	IFNET_RUNLOCK();
 
 	/* Destroy IP reassembly queue. */
 	ipreass_destroy();
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL);
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *ip;
 	int hlen;
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	IPSTAT_INC(ips_delivered);
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/* Try to forward the packet, but if we fail continue */
 #ifdef IPSEC
 	/* For now we do not handle IPSEC in tryforward. */
 	if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) &&
 	    (V_ipforwarding == 1))
 		if (ip_tryforward(m) == NULL)
 			return;
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip_ipsec_filtertunnel(m))
 		goto passin;
 #else
 	if (V_ipforwarding == 1)
 		if (ip_tryforward(m) == NULL)
 			return;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
 		if (dchg != 0) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
          * matter if it is destined to another node, or whether it is 
          * a multicast one, RSVP wants it! and prevents it from being forwarded
          * anywhere else. Also checks if the rsvp daemon is running before
 	 * grabbing the packet.
          */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #ifdef IPSEC
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if (ip_ipsec_input(m, ip->ip_p) != 0)
 		goto bad;
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_slowtimo();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_drain();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * The protocol to be inserted into ip_protox[] must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol == ipproto) {
 			ip_protox[pr->pr_protocol] = pr - inetsw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip_protox[ipproto] = pr - inetsw;
 	return (0);
 }
 
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct sockaddr_in *sin;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #ifdef IPSEC
 	if (ip_ipsec_fwd(m) != 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #endif /* IPSEC */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
 			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
 			    0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
 	}
 #endif
 
 	bzero(&ro, sizeof(ro));
 	sin = (struct sockaddr_in *)&ro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 #ifdef RADIX_MPATH
 	rtalloc_mpath_fib(&ro,
 	    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 	    M_GETFIB(m));
 #else
 	in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 #endif
 	if (ro.ro_rt != NULL) {
 		ia = ifatoia(ro.ro_rt->rt_ifa);
 		ifa_ref(&ia->ia_ifa);
 	} else
 		ia = NULL;
 #ifndef IPSEC
 	/*
 	 * 'ia' may be NULL if there is no route for this destination.
 	 * In case of IPsec, Don't discard it just yet, but pass it to
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		RO_RTFREE(&ro);
 		return;
 	}
 #endif
 
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copy() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct rtentry *rt;
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 	}
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		}
 	}
 	if (mcopy == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 
 #ifdef IPSEC
 		/* 
 		 * If IPsec is configured for this path,
 		 * override any possibly mtu value set by ip_output.
 		 */ 
 		mtu = ip_ipsec_mtu(mcopy, mtu);
 #endif /* IPSEC */
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 
 	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
 		struct bintime bt;
 
 		bintime(&bt);
 		if (inp->inp_socket->so_options & SO_BINTIME) {
 			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 			struct timeval tv;
 
 			bintime2timeval(&bt, &tv);
 			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 static VNET_DEFINE(int, ip_rsvp_on);
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_RSVP)
 		return EOPNOTSUPP;
 
 	if (V_ip_rsvpd != NULL)
 		return EADDRINUSE;
 
 	V_ip_rsvpd = so;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!V_ip_rsvp_on) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return 0;
 }
 
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (V_ip_rsvp_on) {
 		V_ip_rsvp_on = 0;
 		V_rsvp_on--;
 	}
 	return 0;
 }
 
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 
 	if (rsvp_input_p) { /* call the real one if loaded */
 		*mp = m;
 		rsvp_input_p(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 
 	/* Can still get packets with rsvp_on = 0 if there is a local member
 	 * of the group to which the RSVP packet is addressed.  But in this
 	 * case we want to throw the packet away.
 	 */
 	
 	if (!V_rsvp_on) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (V_ip_rsvpd != NULL) { 
 		*mp = m;
 		rip_input(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 	/* Drop the packet */
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
Index: projects/vnet/sys/netinet6/in6.c
===================================================================
--- projects/vnet/sys/netinet6/in6.c	(revision 299172)
+++ projects/vnet/sys/netinet6/in6.c	(revision 299173)
@@ -1,2532 +1,2532 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.c	8.2 (Berkeley) 11/15/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/errno.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_carp.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 
 VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix);
 #define V_icmp6_nodeinfo_oldmcprefix	VNET(icmp6_nodeinfo_oldmcprefix)
 
 /*
  * Definitions of some costant IP6 addresses.
  */
 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
 const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
 const struct in6_addr in6addr_nodelocal_allnodes =
 	IN6ADDR_NODELOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allnodes =
 	IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allrouters =
 	IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
 const struct in6_addr in6addr_linklocal_allv2routers =
 	IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT;
 
 const struct in6_addr in6mask0 = IN6MASK0;
 const struct in6_addr in6mask32 = IN6MASK32;
 const struct in6_addr in6mask64 = IN6MASK64;
 const struct in6_addr in6mask96 = IN6MASK96;
 const struct in6_addr in6mask128 = IN6MASK128;
 
 const struct sockaddr_in6 sa6_any =
 	{ sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 };
 
 static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *,
 	struct in6_aliasreq *, int);
 static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
 
 static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *,
     struct in6_aliasreq *, int flags);
 static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int, int);
 static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 
 #define ifa2ia6(ifa)	((struct in6_ifaddr *)(ifa))
 #define ia62ifa(ia6)	(&((ia6)->ia_ifa))
 
 
 void
 in6_newaddrmsg(struct in6_ifaddr *ia, int cmd)
 {
 	struct sockaddr_dl gateway;
 	struct sockaddr_in6 mask, addr;
 	struct rtentry rt;
 
 	/*
 	 * initialize for rtmsg generation
 	 */
 	bzero(&gateway, sizeof(gateway));
 	gateway.sdl_len = sizeof(gateway);
 	gateway.sdl_family = AF_LINK;
 
 	bzero(&rt, sizeof(rt));
 	rt.rt_gateway = (struct sockaddr *)&gateway;
 	memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
 	memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
 	rt_mask(&rt) = (struct sockaddr *)&mask;
 	rt_key(&rt) = (struct sockaddr *)&addr;
 	rt.rt_flags = RTF_HOST | RTF_STATIC;
 	if (cmd == RTM_ADD)
 		rt.rt_flags |= RTF_UP;
 	/* Announce arrival of local address to all FIBs. */
 	rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt);
 }
 
 int
 in6_mask2len(struct in6_addr *mask, u_char *lim0)
 {
 	int x = 0, y;
 	u_char *lim = lim0, *p;
 
 	/* ignore the scope_id part */
 	if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
 		lim = (u_char *)mask + sizeof(*mask);
 	for (p = (u_char *)mask; p < lim; x++, p++) {
 		if (*p != 0xff)
 			break;
 	}
 	y = 0;
 	if (p < lim) {
 		for (y = 0; y < 8; y++) {
 			if ((*p & (0x80 >> y)) == 0)
 				break;
 		}
 	}
 
 	/*
 	 * when the limit pointer is given, do a stricter check on the
 	 * remaining bits.
 	 */
 	if (p < lim) {
 		if (y != 0 && (*p & (0x00ff >> y)) != 0)
 			return (-1);
 		for (p = p + 1; p < lim; p++)
 			if (*p != 0)
 				return (-1);
 	}
 
 	return x * 8 + y;
 }
 
 #ifdef COMPAT_FREEBSD32
 struct in6_ndifreq32 {
 	char ifname[IFNAMSIZ];
 	uint32_t ifindex;
 };
 #define	SIOCGDEFIFACE32_IN6	_IOWR('i', 86, struct in6_ndifreq32)
 #endif
 
 int
 in6_control(struct socket *so, u_long cmd, caddr_t data,
     struct ifnet *ifp, struct thread *td)
 {
 	struct	in6_ifreq *ifr = (struct in6_ifreq *)data;
 	struct	in6_ifaddr *ia = NULL;
 	struct	in6_aliasreq *ifra = (struct in6_aliasreq *)data;
 	struct sockaddr_in6 *sa6;
 	int carp_attached = 0;
 	int error;
 	u_long ocmd = cmd;
 
 	/*
 	 * Compat to make pre-10.x ifconfig(8) operable.
 	 */
 	if (cmd == OSIOCAIFADDR_IN6)
 		cmd = SIOCAIFADDR_IN6;
 
 	switch (cmd) {
 	case SIOCGETSGCNT_IN6:
 	case SIOCGETMIFCNT_IN6:
 		/*
 		 * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c.
 		 * We cannot see how that would be needed, so do not adjust the
 		 * KPI blindly; more likely should clean up the IPv4 variant.
 		 */
 		return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCAADDRCTL_POLICY:
 	case SIOCDADDRCTL_POLICY:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ADDRCTRL6);
 			if (error)
 				return (error);
 		}
 		return (in6_src_ioctl(cmd, data));
 	}
 
 	if (ifp == NULL)
 		return (EOPNOTSUPP);
 
 	switch (cmd) {
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCSDEFIFACE_IN6:
 	case SIOCSIFINFO_FLAGS:
 	case SIOCSIFINFO_IN6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ND6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case OSIOCGIFINFO_IN6:
 	case SIOCGIFINFO_IN6:
 	case SIOCGNBRINFO_IN6:
 	case SIOCGDEFIFACE_IN6:
 		return (nd6_ioctl(cmd, data, ifp));
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGDEFIFACE32_IN6:
 		{
 			struct in6_ndifreq ndif;
 			struct in6_ndifreq32 *ndif32;
 
 			error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif,
 			    ifp);
 			if (error)
 				return (error);
 			ndif32 = (struct in6_ndifreq32 *)data;
 			ndif32->ifindex = ndif.ifindex;
 			return (0);
 		}
 #endif
 	}
 
 	switch (cmd) {
 	case SIOCSIFPREFIX_IN6:
 	case SIOCDIFPREFIX_IN6:
 	case SIOCAIFPREFIX_IN6:
 	case SIOCCIFPREFIX_IN6:
 	case SIOCSGIFPREFIX_IN6:
 	case SIOCGIFPREFIX_IN6:
 		log(LOG_NOTICE,
 		    "prefix ioctls are now invalidated. "
 		    "please use ifconfig.\n");
 		return (EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCSSCOPE6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_SCOPE6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case SIOCGSCOPE6:
 	case SIOCGSCOPE6DEF:
 		return (scope6_ioctl(cmd, data, ifp));
 	}
 
 	/*
 	 * Find address for this interface, if it exists.
 	 *
 	 * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
 	 * only, and used the first interface address as the target of other
 	 * operations (without checking ifra_addr).  This was because netinet
 	 * code/API assumed at most 1 interface address per interface.
 	 * Since IPv6 allows a node to assign multiple addresses
 	 * on a single interface, we almost always look and check the
 	 * presence of ifra_addr, and reject invalid ones here.
 	 * It also decreases duplicated code among SIOC*_IN6 operations.
 	 */
 	switch (cmd) {
 	case SIOCAIFADDR_IN6:
 	case SIOCSIFPHYADDR_IN6:
 		sa6 = &ifra->ifra_addr;
 		break;
 	case SIOCSIFADDR_IN6:
 	case SIOCGIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCDIFADDR_IN6:
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 	case SIOCGIFAFLAG_IN6:
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCGIFALIFETIME_IN6:
 	case SIOCSIFALIFETIME_IN6:
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		sa6 = &ifr->ifr_addr;
 		break;
 	case SIOCSIFADDR:
 	case SIOCSIFBRDADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFNETMASK:
 		/*
 		 * Although we should pass any non-INET6 ioctl requests
 		 * down to driver, we filter some legacy INET requests.
 		 * Drivers trust SIOCSIFADDR et al to come from an already
 		 * privileged layer, and do not perform any credentials
 		 * checks or input validation.
 		 */
 		return (EINVAL);
 	default:
 		sa6 = NULL;
 		break;
 	}
 	if (sa6 && sa6->sin6_family == AF_INET6) {
 		if (sa6->sin6_scope_id != 0)
 			error = sa6_embedscope(sa6, 0);
 		else
 			error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
 		if (error != 0)
 			return (error);
 		if (td != NULL && (error = prison_check_ip6(td->td_ucred,
 		    &sa6->sin6_addr)) != 0)
 			return (error);
 		ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
 	} else
 		ia = NULL;
 
 	switch (cmd) {
 	case SIOCSIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 		/*
 		 * Since IPv6 allows a node to assign multiple addresses
 		 * on a single interface, SIOCSIFxxx ioctls are deprecated.
 		 */
 		/* we decided to obsolete this command (20000704) */
 		error = EINVAL;
 		goto out;
 
 	case SIOCDIFADDR_IN6:
 		/*
 		 * for IPv4, we look for existing in_ifaddr here to allow
 		 * "ifconfig if0 delete" to remove the first IPv4 address on
 		 * the interface.  For IPv6, as the spec allows multiple
 		 * interface address from the day one, we consider "remove the
 		 * first one" semantics to be not preferable.
 		 */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCAIFADDR_IN6:
 		/*
 		 * We always require users to specify a valid IPv6 address for
 		 * the corresponding operation.
 		 */
 		if (ifra->ifra_addr.sin6_family != AF_INET6 ||
 		    ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 
 		if (td != NULL) {
 			error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ?
 			    PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR);
 			if (error)
 				goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		if (ifp->if_afdata[AF_INET6] == NULL) {
 			error = EPFNOSUPPORT;
 			goto out;
 		}
 		break;
 
 	case SIOCGIFADDR_IN6:
 		/* This interface is basically deprecated. use SIOCGIFCONF. */
 		/* FALLTHROUGH */
 	case SIOCGIFAFLAG_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFALIFETIME_IN6:
 		/* must think again about its semantics */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		break;
 
 	case SIOCSIFALIFETIME_IN6:
 	    {
 		struct in6_addrlifetime *lt;
 
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ALIFETIME6);
 			if (error)
 				goto out;
 		}
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		/* sanity for overflow - beware unsigned */
 		lt = &ifr->ifr_ifru.ifru_lifetime;
 		if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME &&
 		    lt->ia6t_vltime + time_uptime < time_uptime) {
 			error = EINVAL;
 			goto out;
 		}
 		if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME &&
 		    lt->ia6t_pltime + time_uptime < time_uptime) {
 			error = EINVAL;
 			goto out;
 		}
 		break;
 	    }
 	}
 
 	switch (cmd) {
 	case SIOCGIFADDR_IN6:
 		ifr->ifr_addr = ia->ia_addr;
 		if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFDSTADDR_IN6:
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
 			error = EINVAL;
 			goto out;
 		}
 		/*
 		 * XXX: should we check if ifa_dstaddr is NULL and return
 		 * an error?
 		 */
 		ifr->ifr_dstaddr = ia->ia_dstaddr;
 		if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFNETMASK_IN6:
 		ifr->ifr_addr = ia->ia_prefixmask;
 		break;
 
 	case SIOCGIFAFLAG_IN6:
 		ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
 		break;
 
 	case SIOCGIFSTAT_IN6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->in6_ifstat,
 		    &ifr->ifr_ifru.ifru_stat,
 		    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFSTAT_ICMP6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->icmp6_ifstat,
 		    &ifr->ifr_ifru.ifru_icmp6stat,
 		    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFALIFETIME_IN6:
 		ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
 		if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_vltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_expire = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_vltime;
 			} else
 				retlt->ia6t_expire = maxexpire;
 		}
 		if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_pltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_preferred = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_pltime;
 			} else
 				retlt->ia6t_preferred = maxexpire;
 		}
 		break;
 
 	case SIOCSIFALIFETIME_IN6:
 		ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime;
 		/* for sanity */
 		if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 			ia->ia6_lifetime.ia6t_expire =
 				time_uptime + ia->ia6_lifetime.ia6t_vltime;
 		} else
 			ia->ia6_lifetime.ia6t_expire = 0;
 		if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 			ia->ia6_lifetime.ia6t_preferred =
 				time_uptime + ia->ia6_lifetime.ia6t_pltime;
 		} else
 			ia->ia6_lifetime.ia6t_preferred = 0;
 		break;
 
 	case SIOCAIFADDR_IN6:
 	{
 		struct nd_prefixctl pr0;
 		struct nd_prefix *pr;
 
 		/*
 		 * first, make or update the interface address structure,
 		 * and link it to the list.
 		 */
 		if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0)
 			goto out;
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr))
 		    == NULL) {
 			/*
 			 * this can happen when the user specify the 0 valid
 			 * lifetime.
 			 */
 			break;
 		}
 
 		if (cmd == ocmd && ifra->ifra_vhid > 0) {
 			if (carp_attach_p != NULL)
 				error = (*carp_attach_p)(&ia->ia_ifa,
 				    ifra->ifra_vhid);
 			else
 				error = EPROTONOSUPPORT;
 			if (error)
 				goto out;
 			else
 				carp_attached = 1;
 		}
 
 		/*
 		 * then, make the prefix on-link on the interface.
 		 * XXX: we'd rather create the prefix before the address, but
 		 * we need at least one address to install the corresponding
 		 * interface route, so we configure the address first.
 		 */
 
 		/*
 		 * convert mask to prefix length (prefixmask has already
 		 * been validated in in6_update_ifa().
 		 */
 		bzero(&pr0, sizeof(pr0));
 		pr0.ndpr_ifp = ifp;
 		pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 		    NULL);
 		if (pr0.ndpr_plen == 128) {
 			/* we don't need to install a host route. */
 			goto aifaddr_out;
 		}
 		pr0.ndpr_prefix = ifra->ifra_addr;
 		/* apply the mask for safety. */
 		IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr,
 		    &ifra->ifra_prefixmask.sin6_addr);
 
 		/*
 		 * XXX: since we don't have an API to set prefix (not address)
 		 * lifetimes, we just use the same lifetimes as addresses.
 		 * The (temporarily) installed lifetimes can be overridden by
 		 * later advertised RAs (when accept_rtadv is non 0), which is
 		 * an intended behavior.
 		 */
 		pr0.ndpr_raf_onlink = 1; /* should be configurable? */
 		pr0.ndpr_raf_auto =
 		    ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0);
 		pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime;
 		pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime;
 
 		/* add the prefix if not yet. */
 		if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
 			/*
 			 * nd6_prelist_add will install the corresponding
 			 * interface route.
 			 */
 			if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) {
 				if (carp_attached)
 					(*carp_detach_p)(&ia->ia_ifa);
 				goto out;
 			}
 			if (pr == NULL) {
 				if (carp_attached)
 					(*carp_detach_p)(&ia->ia_ifa);
 				log(LOG_ERR, "nd6_prelist_add succeeded but "
 				    "no prefix\n");
 				error = EINVAL;
 				goto out;
 			}
 		}
 
 		/* relate the address to the prefix */
 		if (ia->ia6_ndpr == NULL) {
 			ia->ia6_ndpr = pr;
 			pr->ndpr_refcnt++;
 
 			/*
 			 * If this is the first autoconf address from the
 			 * prefix, create a temporary address as well
 			 * (when required).
 			 */
 			if ((ia->ia6_flags & IN6_IFF_AUTOCONF) &&
 			    V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) {
 				int e;
 				if ((e = in6_tmpifadd(ia, 1, 0)) != 0) {
 					log(LOG_NOTICE, "in6_control: failed "
 					    "to create a temporary address, "
 					    "errno=%d\n", e);
 				}
 			}
 		}
 
 		/*
 		 * this might affect the status of autoconfigured addresses,
 		 * that is, this address might make other addresses detached.
 		 */
 		pfxlist_onlink_check();
 aifaddr_out:
 		if (error != 0 || ia == NULL)
 			break;
 		/*
 		 * Try to clear the flag when a new IPv6 address is added
 		 * onto an IFDISABLED interface and it succeeds.
 		 */
 		if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
 			struct in6_ndireq nd;
 
 			memset(&nd, 0, sizeof(nd));
 			nd.ndi.flags = ND_IFINFO(ifp)->flags;
 			nd.ndi.flags &= ~ND6_IFF_IFDISABLED;
 			if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0)
 				log(LOG_NOTICE, "SIOCAIFADDR_IN6: "
 				    "SIOCSIFINFO_FLAGS for -ifdisabled "
 				    "failed.");
 			/*
 			 * Ignore failure of clearing the flag intentionally.
 			 * The failure means address duplication was detected.
 			 */
 		}
 		EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 		break;
 	}
 
 	case SIOCDIFADDR_IN6:
 	{
 		struct nd_prefix *pr;
 
 		/*
 		 * If the address being deleted is the only one that owns
 		 * the corresponding prefix, expire the prefix as well.
 		 * XXX: theoretically, we don't have to worry about such
 		 * relationship, since we separate the address management
 		 * and the prefix management.  We do this, however, to provide
 		 * as much backward compatibility as possible in terms of
 		 * the ioctl operation.
 		 * Note that in6_purgeaddr() will decrement ndpr_refcnt.
 		 */
 		pr = ia->ia6_ndpr;
 		in6_purgeaddr(&ia->ia_ifa);
 		if (pr && pr->ndpr_refcnt == 0)
 			prelist_remove(pr);
 		EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 		break;
 	}
 
 	default:
 		if (ifp->if_ioctl == NULL) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		goto out;
 	}
 
 	error = 0;
 out:
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 
 /*
  * Join necessary multicast groups.  Factored out from in6_update_ifa().
  * This entire work should only be done once, for the default FIB.
  */
 static int
 in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	struct in6_addr mltaddr;
 	struct in6_multi_mship *imm;
 	int delay, error;
 
 	KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__));
 
 	/* Join solicited multicast addr for new host id. */
 	bzero(&mltaddr, sizeof(struct in6_addr));
 	mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL;
 	mltaddr.s6_addr32[2] = htonl(1);
 	mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3];
 	mltaddr.s6_addr8[12] = 0xff;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) {
 		/* XXX: should not happen */
 		log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
 		goto cleanup;
 	}
 	delay = error = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * We need a random delay for DAD on the address being
 		 * configured.  It also means delaying transmission of the
 		 * corresponding MLD report to avoid report collision.
 		 * [RFC 4861, Section 6.3.7]
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	*in6m_sol = imm->i6mm_maddr;
 
 	/*
 	 * Join link-local all-nodes address.
 	 */
 	mltaddr = in6addr_linklocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 	/*
 	 * Join node information group address.
 	 */
 	delay = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * The spec does not say anything about delay for this group,
 		 * but the same logic should apply.
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) {
 		/* XXX jinmei */
 		imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 	if (V_icmp6_nodeinfo_oldmcprefix &&
 	    in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) {
 		imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 
 	/*
 	 * Join interface-local all-nodes address.
 	 * (ff01::1%ifN, and ff01::%ifN/32)
 	 */
 	mltaddr = in6addr_nodelocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 		    &mltaddr), if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 cleanup:
 	return (error);
 }
 
 /*
  * Update parameters of an IPv6 interface address.
  * If necessary, a new entry is created and linked into address chains.
  * This function is separated from in6_control().
  */
 int
 in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int error, hostIsNew = 0;
 
 	if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0)
 		return (error);
 
 	if (ia == NULL) {
 		hostIsNew = 1;
 		if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL)
 			return (ENOBUFS);
 	}
 
 	error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags);
 	if (error != 0) {
 		if (hostIsNew != 0) {
 			in6_unlink_ifa(ia, ifp);
 			ifa_free(&ia->ia_ifa);
 		}
 		return (error);
 	}
 
 	if (hostIsNew)
 		error = in6_broadcast_ifa(ifp, ifra, ia, flags);
 
 	return (error);
 }
 
 /*
  * Fill in basic IPv6 address request info.
  */
 void
 in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr,
     const struct in6_addr *mask)
 {
 
 	memset(ifra, 0, sizeof(struct in6_aliasreq));
 
 	ifra->ifra_addr.sin6_family = AF_INET6;
 	ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
 	if (addr != NULL)
 		ifra->ifra_addr.sin6_addr = *addr;
 
 	ifra->ifra_prefixmask.sin6_family = AF_INET6;
 	ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
 	if (mask != NULL)
 		ifra->ifra_prefixmask.sin6_addr = *mask;
 }
 
 static int
 in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int plen = -1;
 	struct sockaddr_in6 dst6;
 	struct in6_addrlifetime *lt;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* Validate parameters */
 	if (ifp == NULL || ifra == NULL) /* this maybe redundant */
 		return (EINVAL);
 
 	/*
 	 * The destination address for a p2p link must have a family
 	 * of AF_UNSPEC or AF_INET6.
 	 */
 	if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Validate address
 	 */
 	if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) ||
 	    ifra->ifra_addr.sin6_family != AF_INET6)
 		return (EINVAL);
 
 	/*
 	 * validate ifra_prefixmask.  don't check sin6_family, netmask
 	 * does not carry fields other than sin6_len.
 	 */
 	if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
 		return (EINVAL);
 	/*
 	 * Because the IPv6 address architecture is classless, we require
 	 * users to specify a (non 0) prefix length (mask) for a new address.
 	 * We also require the prefix (when specified) mask is valid, and thus
 	 * reject a non-consecutive mask.
 	 */
 	if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
 		return (EINVAL);
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 		    (u_char *)&ifra->ifra_prefixmask +
 		    ifra->ifra_prefixmask.sin6_len);
 		if (plen <= 0)
 			return (EINVAL);
 	} else {
 		/*
 		 * In this case, ia must not be NULL.  We just use its prefix
 		 * length.
 		 */
 		plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
 	}
 	/*
 	 * If the destination address on a p2p interface is specified,
 	 * and the address is a scoped one, validate/set the scope
 	 * zone identifier.
 	 */
 	dst6 = ifra->ifra_dstaddr;
 	if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
 	    (dst6.sin6_family == AF_INET6)) {
 		struct in6_addr in6_tmp;
 		u_int32_t zoneid;
 
 		in6_tmp = dst6.sin6_addr;
 		if (in6_setscope(&in6_tmp, ifp, &zoneid))
 			return (EINVAL); /* XXX: should be impossible */
 
 		if (dst6.sin6_scope_id != 0) {
 			if (dst6.sin6_scope_id != zoneid)
 				return (EINVAL);
 		} else		/* user omit to specify the ID. */
 			dst6.sin6_scope_id = zoneid;
 
 		/* convert into the internal form */
 		if (sa6_embedscope(&dst6, 0))
 			return (EINVAL); /* XXX: should be impossible */
 	}
 	/* Modify original ifra_dstaddr to reflect changes */
 	ifra->ifra_dstaddr = dst6;
 
 	/*
 	 * The destination address can be specified only for a p2p or a
 	 * loopback interface.  If specified, the corresponding prefix length
 	 * must be 128.
 	 */
 	if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
 		if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
 			/* XXX: noisy message */
 			nd6log((LOG_INFO, "in6_update_ifa: a destination can "
 			    "be specified for a p2p or a loopback IF only\n"));
 			return (EINVAL);
 		}
 		if (plen != 128) {
 			nd6log((LOG_INFO, "in6_update_ifa: prefixlen should "
 			    "be 128 when dstaddr is specified\n"));
 			return (EINVAL);
 		}
 	}
 	/* lifetime consistency check */
 	lt = &ifra->ifra_lifetime;
 	if (lt->ia6t_pltime > lt->ia6t_vltime)
 		return (EINVAL);
 	if (lt->ia6t_vltime == 0) {
 		/*
 		 * the following log might be noisy, but this is a typical
 		 * configuration mistake or a tool's bug.
 		 */
 		nd6log((LOG_INFO,
 		    "in6_update_ifa: valid lifetime is 0 for %s\n",
 		    ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr)));
 
 		if (ia == NULL)
 			return (0); /* there's nothing to do */
 	}
 
 	/* Check prefix mask */
 	if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) {
 		/*
 		 * We prohibit changing the prefix length of an existing
 		 * address, because
 		 * + such an operation should be rare in IPv6, and
 		 * + the operation would confuse prefix management.
 		 */
 		if (ia->ia_prefixmask.sin6_len != 0 &&
 		    in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) {
 			nd6log((LOG_INFO, "in6_validate_ifa: the prefix length "
 			    "of an existing %s address should not be changed\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 
 			return (EINVAL);
 		}
 	}
 
 	return (0);
 }
 
 
 /*
  * Allocate a new ifaddr and link it into chains.
  */
 static struct in6_ifaddr *
 in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
 {
 	struct in6_ifaddr *ia;
 
 	/*
 	 * When in6_alloc_ifa() is called in a process of a received
 	 * RA, it is called under an interrupt context.  So, we should
 	 * call malloc with M_NOWAIT.
 	 */
 	ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT);
 	if (ia == NULL)
 		return (NULL);
 	LIST_INIT(&ia->ia6_memberships);
 	/* Initialize the address and masks, and put time stamp */
 	ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
 	ia->ia_addr.sin6_family = AF_INET6;
 	ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
 	/* XXX: Can we assign ,sin6_addr and skip the rest? */
 	ia->ia_addr = ifra->ifra_addr;
 	ia->ia6_createtime = time_uptime;
 	if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
 		/*
 		 * Some functions expect that ifa_dstaddr is not
 		 * NULL for p2p interfaces.
 		 */
 		ia->ia_ifa.ifa_dstaddr =
 		    (struct sockaddr *)&ia->ia_dstaddr;
 	} else {
 		ia->ia_ifa.ifa_dstaddr = NULL;
 	}
 
 	/* set prefix mask if any */
 	ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask;
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		ia->ia_prefixmask.sin6_family = AF_INET6;
 		ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len;
 		ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr;
 	}
 
 	ia->ia_ifp = ifp;
 	ifa_ref(&ia->ia_ifa);			/* if_addrhead */
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 
 	ifa_ref(&ia->ia_ifa);			/* in6_ifaddrhead */
 	IN6_IFADDR_WLOCK();
 	TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link);
 	LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	return (ia);
 }
 
 /*
  * Update/configure interface address parameters:
  *
  * 1) Update lifetime
  * 2) Update interface metric ad flags
  * 3) Notify other subsystems
  */
 static int
 in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int hostIsNew, int flags)
 {
 	int error;
 
 	/* update timestamp */
 	ia->ia6_updatetime = time_uptime;
 
 	/*
 	 * Set lifetimes.  We do not refer to ia6t_expire and ia6t_preferred
 	 * to see if the address is deprecated or invalidated, but initialize
 	 * these members for applications.
 	 */
 	ia->ia6_lifetime = ifra->ifra_lifetime;
 	if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_expire =
 		    time_uptime + ia->ia6_lifetime.ia6t_vltime;
 	} else
 		ia->ia6_lifetime.ia6t_expire = 0;
 	if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_preferred =
 		    time_uptime + ia->ia6_lifetime.ia6t_pltime;
 	} else
 		ia->ia6_lifetime.ia6t_preferred = 0;
 
 	/*
 	 * backward compatibility - if IN6_IFF_DEPRECATED is set from the
 	 * userland, make it deprecated.
 	 */
 	if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
 		ia->ia6_lifetime.ia6t_pltime = 0;
 		ia->ia6_lifetime.ia6t_preferred = time_uptime;
 	}
 
 	/*
 	 * configure address flags.
 	 */
 	ia->ia6_flags = ifra->ifra_flags;
 
 	/*
 	 * Make the address tentative before joining multicast addresses,
 	 * so that corresponding MLD responses would not have a tentative
 	 * source address.
 	 */
 	ia->ia6_flags &= ~IN6_IFF_DUPLICATED;	/* safety */
 
 	/*
 	 * DAD should be performed for an new address or addresses on
 	 * an interface with ND6_IFF_IFDISABLED.
 	 */
 	if (in6if_do_dad(ifp) &&
 	    (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)))
 		ia->ia6_flags |= IN6_IFF_TENTATIVE;
 
 	/* notify other subsystems */
 	error = in6_notify_ifa(ifp, ia, ifra, hostIsNew);
 
 	return (error);
 }
 
 /*
  * Do link-level ifa job:
  * 1) Add lle entry for added address
  * 2) Notifies routing socket users about new address
  * 3) join appropriate multicast group
  * 4) start DAD if enabled
  */
 static int
 in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	struct in6_multi *in6m_sol;
 	int error = 0;
 
 	/* Add local address to lltable, if necessary (ex. on p2p link). */
 	if ((error = nd6_add_ifa_lle(ia)) != 0) {
 		in6_purgeaddr(&ia->ia_ifa);
 		ifa_free(&ia->ia_ifa);
 		return (error);
 	}
 
 	/* Join necessary multicast groups. */
 	in6m_sol = NULL;
 	if ((ifp->if_flags & IFF_MULTICAST) != 0) {
 		error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol);
 		if (error != 0) {
 			in6_purgeaddr(&ia->ia_ifa);
 			ifa_free(&ia->ia_ifa);
 			return (error);
 		}
 	}
 
 	/* Perform DAD, if the address is TENTATIVE. */
 	if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
 		int delay, mindelay, maxdelay;
 
 		delay = 0;
 		if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 			/*
 			 * We need to impose a delay before sending an NS
 			 * for DAD.  Check if we also needed a delay for the
 			 * corresponding MLD message.  If we did, the delay
 			 * should be larger than the MLD delay (this could be
 			 * relaxed a bit, but this simple logic is at least
 			 * safe).
 			 * XXX: Break data hiding guidelines and look at
 			 * state for the solicited multicast group.
 			 */
 			mindelay = 0;
 			if (in6m_sol != NULL &&
 			    in6m_sol->in6m_state == MLD_REPORTING_MEMBER) {
 				mindelay = in6m_sol->in6m_timer;
 			}
 			maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
 			if (maxdelay - mindelay == 0)
 				delay = 0;
 			else {
 				delay =
 				    (arc4random() % (maxdelay - mindelay)) +
 				    mindelay;
 			}
 		}
 		nd6_dad_start((struct ifaddr *)ia, delay);
 	}
 
 	in6_newaddrmsg(ia, RTM_ADD);
 	ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 void
 in6_purgeaddr(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
 	struct in6_multi_mship *imm;
 	int plen, error;
 
 	if (ifa->ifa_carp)
 		(*carp_detach_p)(ifa);
 
 	/*
 	 * Remove the loopback route to the interface address.
 	 * The check for the current setting of "nd6_useloopback"
 	 * is not needed.
 	 */
 	if (ia->ia_flags & IFA_RTSELF) {
 		error = ifa_del_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		if (error == 0)
 			ia->ia_flags &= ~IFA_RTSELF;
 	}
 
 	/* stop DAD processing */
 	nd6_dad_stop(ifa);
 
 	/* Leave multicast groups. */
 	while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
 		LIST_REMOVE(imm, i6mm_chain);
 		in6_leavegroup(imm);
 	}
 	plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
 	if ((ia->ia_flags & IFA_ROUTE) && plen == 128) {
 		error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags |
 		    (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0));
 		if (error != 0)
 			log(LOG_INFO, "%s: err=%d, destination address delete "
 			    "failed\n", __func__, error);
 		ia->ia_flags &= ~IFA_ROUTE;
 	}
 
 	in6_newaddrmsg(ia, RTM_DELETE);
 	in6_unlink_ifa(ia, ifp);
 }
 
 static void
 in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	int remove_lle;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);			/* if_addrhead */
 
 	/*
 	 * Defer the release of what might be the last reference to the
 	 * in6_ifaddr so that it can't be freed before the remainder of the
 	 * cleanup.
 	 */
 	IN6_IFADDR_WLOCK();
 	TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link);
 	LIST_REMOVE(ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	/*
 	 * Release the reference to the base prefix.  There should be a
 	 * positive reference.
 	 */
 	remove_lle = 0;
 	if (ia->ia6_ndpr == NULL) {
 		nd6log((LOG_NOTICE,
 		    "in6_unlink_ifa: autoconf'ed address "
 		    "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
 	} else {
 		ia->ia6_ndpr->ndpr_refcnt--;
 		/* Do not delete lles within prefix if refcont != 0 */
 		if (ia->ia6_ndpr->ndpr_refcnt == 0)
 			remove_lle = 1;
 		ia->ia6_ndpr = NULL;
 	}
 
 	nd6_rem_ifa_lle(ia, remove_lle);
 
 	/*
 	 * Also, if the address being removed is autoconf'ed, call
 	 * pfxlist_onlink_check() since the release might affect the status of
 	 * other (detached) addresses.
 	 */
 	if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) {
 		pfxlist_onlink_check();
 	}
 	ifa_free(&ia->ia_ifa);			/* in6_ifaddrhead */
 }
 
 /*
  * Notifies other subsystems about address change/arrival:
  * 1) Notifies device handler on the first IPv6 address assignment
  * 2) Handle routing table changes for P2P links and route
  * 3) Handle routing table changes for address host route
  */
 static int
 in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia,
     struct in6_aliasreq *ifra, int hostIsNew)
 {
 	int	error = 0, plen, ifacount = 0;
 	struct ifaddr *ifa;
 	struct sockaddr_in6 *pdst;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/*
 	 * Give the interface a chance to initialize
 	 * if this is its first address,
 	 */
 	if (hostIsNew != 0) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifacount++;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 
 	if (ifacount <= 1 && ifp->if_ioctl) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * If a new destination address is specified, scrub the old one and
 	 * install the new destination.  Note that the interface must be
 	 * p2p or loopback.
 	 */
 	pdst = &ifra->ifra_dstaddr;
 	if (pdst->sin6_family == AF_INET6 &&
 	    !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) {
 		if ((ia->ia_flags & IFA_ROUTE) != 0 &&
 		    (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) {
 			nd6log((LOG_ERR, "in6_update_ifa_internal: failed to "
 			    "remove a route to the old destination: %s\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 			/* proceed anyway... */
 		} else
 			ia->ia_flags &= ~IFA_ROUTE;
 		ia->ia_dstaddr = *pdst;
 	}
 
 	/*
 	 * If a new destination address is specified for a point-to-point
 	 * interface, install a route to the destination as an interface
 	 * direct route.
 	 * XXX: the logic below rejects assigning multiple addresses on a p2p
 	 * interface that share the same destination.
 	 */
 	plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
 	if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 &&
 	    ia->ia_dstaddr.sin6_family == AF_INET6) {
 		int rtflags = RTF_UP | RTF_HOST;
 		/*
 		 * Handle the case for ::1 .
 		 */
 		if (ifp->if_flags & IFF_LOOPBACK)
 			ia->ia_flags |= IFA_RTSELF;
 		error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags);
 		if (error)
 			return (error);
 		ia->ia_flags |= IFA_ROUTE;
 	}
 
 	/*
 	 * add a loopback route to self if not exists
 	 */
 	if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) {
 		error = ifa_add_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		if (error == 0)
 			ia->ia_flags |= IFA_RTSELF;
 	}
 
 	return (error);
 }
 
 /*
  * Find an IPv6 interface link-local address specific to an interface.
  * ifaddr is returned referenced.
  */
 struct in6_ifaddr *
 in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) {
 			if ((((struct in6_ifaddr *)ifa)->ia6_flags &
 			    ignoreflags) != 0)
 				continue;
 			ifa_ref(ifa);
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return ((struct in6_ifaddr *)ifa);
 }
 
 
 /*
  * find the internet address corresponding to a given address.
  * ifaddr is returned referenced.
  */
 struct in6_ifaddr *
 in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) {
 		if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) {
 			if (zoneid != 0 &&
 			    zoneid != ia->ia_addr.sin6_scope_id)
 				continue;
 			ifa_ref(&ia->ia_ifa);
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (ia);
 }
 
 /*
  * find the internet address corresponding to a given interface and address.
  * ifaddr is returned referenced.
  */
 struct in6_ifaddr *
 in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) {
 			ifa_ref(ifa);
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Find a link-local scoped address on ifp and return it if any.
  */
 struct in6_ifaddr *
 in6ifa_llaonifp(struct ifnet *ifp)
 {
 	struct sockaddr_in6 *sin6;
 	struct ifaddr *ifa;
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)
 		return (NULL);
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
 		if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) ||
 		    IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr))
 			break;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Convert IP6 address to printable (loggable) representation. Caller
  * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long.
  */
 static char digits[] = "0123456789abcdef";
 char *
 ip6_sprintf(char *ip6buf, const struct in6_addr *addr)
 {
 	int i, cnt = 0, maxcnt = 0, idx = 0, index = 0;
 	char *cp;
 	const u_int16_t *a = (const u_int16_t *)addr;
 	const u_int8_t *d;
 	int dcolon = 0, zero = 0;
 
 	cp = ip6buf;
 
 	for (i = 0; i < 8; i++) {
 		if (*(a + i) == 0) {
 			cnt++;
 			if (cnt == 1)
 				idx = i;
 		}
 		else if (maxcnt < cnt) {
 			maxcnt = cnt;
 			index = idx;
 			cnt = 0;
 		}
 	}
 	if (maxcnt < cnt) {
 		maxcnt = cnt;
 		index = idx;
 	}
 
 	for (i = 0; i < 8; i++) {
 		if (dcolon == 1) {
 			if (*a == 0) {
 				if (i == 7)
 					*cp++ = ':';
 				a++;
 				continue;
 			} else
 				dcolon = 2;
 		}
 		if (*a == 0) {
 			if (dcolon == 0 && *(a + 1) == 0 && i == index) {
 				if (i == 0)
 					*cp++ = ':';
 				*cp++ = ':';
 				dcolon = 1;
 			} else {
 				*cp++ = '0';
 				*cp++ = ':';
 			}
 			a++;
 			continue;
 		}
 		d = (const u_char *)a;
 		/* Try to eliminate leading zeros in printout like in :0001. */
 		zero = 1;
 		*cp = digits[*d >> 4];
 		if (*cp != '0') {
 			zero = 0;
 			cp++;
 		}
 		*cp = digits[*d++ & 0xf];
 		if (zero == 0 || (*cp != '0')) {
 			zero = 0;
 			cp++;
 		}
 		*cp = digits[*d >> 4];
 		if (zero == 0 || (*cp != '0')) {
 			zero = 0;
 			cp++;
 		}
 		*cp++ = digits[*d & 0xf];
 		*cp++ = ':';
 		a++;
 	}
 	*--cp = '\0';
 	return (ip6buf);
 }
 
 int
 in6_localaddr(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
 		return 1;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
 		    &ia->ia_prefixmask.sin6_addr)) {
 			IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 			return 1;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (0);
 }
 
 /*
  * Return 1 if an internet address is for the local host and configured
  * on one of its interfaces.
  */
 int
 in6_localip(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) {
 		if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) {
 			IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 			return (1);
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (0);
 }
  
 /*
  * Return 1 if an internet address is configured on an interface.
  */
 int
 in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr)
 {
 	struct in6_addr in6;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia6;
 
 	in6 = *addr;
 	if (in6_clearscope(&in6))
 		return (0);
 	in6_setscope(&in6, ifp, NULL);
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia6 = (struct in6_ifaddr *)ifa;
 		if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) {
 			IF_ADDR_RUNLOCK(ifp);
 			return (1);
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (0);
 }
 
 int
 in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) {
 		if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) {
 			if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
 				IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 				return (1); /* true */
 			}
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (0);		/* false */
 }
 
 /*
  * return length of part which dst and src are equal
  * hard coding...
  */
 int
 in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
 {
 	int match = 0;
 	u_char *s = (u_char *)src, *d = (u_char *)dst;
 	u_char *lim = s + 16, r;
 
 	while (s < lim)
 		if ((r = (*d++ ^ *s++)) != 0) {
 			while (r < 128) {
 				match++;
 				r <<= 1;
 			}
 			break;
 		} else
 			match += 8;
 	return match;
 }
 
 /* XXX: to be scope conscious */
 int
 in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len)
 {
 	int bytelen, bitlen;
 
 	/* sanity check */
 	if (0 > len || len > 128) {
 		log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n",
 		    len);
 		return (0);
 	}
 
 	bytelen = len / 8;
 	bitlen = len % 8;
 
 	if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen))
 		return (0);
 	if (bitlen != 0 &&
 	    p1->s6_addr[bytelen] >> (8 - bitlen) !=
 	    p2->s6_addr[bytelen] >> (8 - bitlen))
 		return (0);
 
 	return (1);
 }
 
 void
 in6_prefixlen2mask(struct in6_addr *maskp, int len)
 {
 	u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
 	int bytelen, bitlen, i;
 
 	/* sanity check */
 	if (0 > len || len > 128) {
 		log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
 		    len);
 		return;
 	}
 
 	bzero(maskp, sizeof(*maskp));
 	bytelen = len / 8;
 	bitlen = len % 8;
 	for (i = 0; i < bytelen; i++)
 		maskp->s6_addr[i] = 0xff;
 	if (bitlen)
 		maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
 }
 
 /*
  * return the best address out of the same scope. if no address was
  * found, return the first valid address from designated IF.
  */
 struct in6_ifaddr *
 in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
 {
 	int dst_scope =	in6_addrscope(dst), blen = -1, tlen;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *besta = 0;
 	struct in6_ifaddr *dep[2];	/* last-resort: deprecated */
 
 	dep[0] = dep[1] = NULL;
 
 	/*
 	 * We first look for addresses in the same scope.
 	 * If there is one, return it.
 	 * If two or more, return one which matches the dst longest.
 	 * If none, return one of global addresses assigned other ifs.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			if (V_ip6_use_deprecated)
 				dep[0] = (struct in6_ifaddr *)ifa;
 			continue;
 		}
 
 		if (dst_scope == in6_addrscope(IFA_IN6(ifa))) {
 			/*
 			 * call in6_matchlen() as few as possible
 			 */
 			if (besta) {
 				if (blen == -1)
 					blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst);
 				tlen = in6_matchlen(IFA_IN6(ifa), dst);
 				if (tlen > blen) {
 					blen = tlen;
 					besta = (struct in6_ifaddr *)ifa;
 				}
 			} else
 				besta = (struct in6_ifaddr *)ifa;
 		}
 	}
 	if (besta) {
 		ifa_ref(&besta->ia_ifa);
 		IF_ADDR_RUNLOCK(ifp);
 		return (besta);
 	}
 
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			if (V_ip6_use_deprecated)
 				dep[1] = (struct in6_ifaddr *)ifa;
 			continue;
 		}
 
 		if (ifa != NULL)
 			ifa_ref(ifa);
 		IF_ADDR_RUNLOCK(ifp);
 		return (struct in6_ifaddr *)ifa;
 	}
 
 	/* use the last-resort values, that are, deprecated addresses */
 	if (dep[0]) {
 		ifa_ref((struct ifaddr *)dep[0]);
 		IF_ADDR_RUNLOCK(ifp);
 		return dep[0];
 	}
 	if (dep[1]) {
 		ifa_ref((struct ifaddr *)dep[1]);
 		IF_ADDR_RUNLOCK(ifp);
 		return dep[1];
 	}
 
 	IF_ADDR_RUNLOCK(ifp);
 	return NULL;
 }
 
 /*
  * perform DAD when interface becomes IFF_UP.
  */
 void
 in6_if_up(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia = (struct in6_ifaddr *)ifa;
 		if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
 			/*
 			 * The TENTATIVE flag was likely set by hand
 			 * beforehand, implicitly indicating the need for DAD.
 			 * We may be able to skip the random delay in this
 			 * case, but we impose delays just in case.
 			 */
 			nd6_dad_start(ifa,
 			    arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz));
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * special cases, like 6to4, are handled in in6_ifattach
 	 */
 	in6_ifattach(ifp, NULL);
 }
 
 int
 in6if_do_dad(struct ifnet *ifp)
 {
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0)
 		return (0);
 
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) ||
 	    (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD))
 		return (0);
 
 	/*
 	 * Our DAD routine requires the interface up and running.
 	 * However, some interfaces can be up before the RUNNING
 	 * status.  Additionaly, users may try to assign addresses
 	 * before the interface becomes up (or running).
 	 * This function returns EAGAIN in that case.
 	 * The caller should mark "tentative" on the address instead of
 	 * performing DAD immediately.
 	 */
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		return (EAGAIN);
 
 	return (1);
 }
 
 /*
  * Calculate max IPv6 MTU through all the interfaces and store it
  * to in6_maxmtu.
  */
 void
 in6_setmaxmtu(void)
 {
 	unsigned long maxmtu = 0;
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* this function can be called during ifnet initialization */
 		if (!ifp->if_afdata[AF_INET6])
 			continue;
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
 		    IN6_LINKMTU(ifp) > maxmtu)
 			maxmtu = IN6_LINKMTU(ifp);
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	if (maxmtu)	/* update only when maxmtu is positive */
 		V_in6_maxmtu = maxmtu;
 }
 
 /*
  * Provide the length of interface identifiers to be used for the link attached
  * to the given interface.  The length should be defined in "IPv6 over
  * xxx-link" document.  Note that address architecture might also define
  * the length for a particular set of address prefixes, regardless of the
  * link type.  As clarified in rfc2462bis, those two definitions should be
  * consistent, and those really are as of August 2004.
  */
 int
 in6_if2idlen(struct ifnet *ifp)
 {
 	switch (ifp->if_type) {
 	case IFT_ETHER:		/* RFC2464 */
 	case IFT_PROPVIRTUAL:	/* XXX: no RFC. treat it as ether */
 	case IFT_L2VLAN:	/* ditto */
 	case IFT_IEEE80211:	/* ditto */
 	case IFT_BRIDGE:	/* bridge(4) only does Ethernet-like links */
 	case IFT_INFINIBAND:
 		return (64);
 	case IFT_FDDI:		/* RFC2467 */
 		return (64);
 	case IFT_ISO88025:	/* RFC2470 (IPv6 over Token Ring) */
 		return (64);
 	case IFT_PPP:		/* RFC2472 */
 		return (64);
 	case IFT_ARCNET:	/* RFC2497 */
 		return (64);
 	case IFT_FRELAY:	/* RFC2590 */
 		return (64);
 	case IFT_IEEE1394:	/* RFC3146 */
 		return (64);
 	case IFT_GIF:
 		return (64);	/* draft-ietf-v6ops-mech-v2-07 */
 	case IFT_LOOP:
 		return (64);	/* XXX: is this really correct? */
 	default:
 		/*
 		 * Unknown link type:
 		 * It might be controversial to use the today's common constant
 		 * of 64 for these cases unconditionally.  For full compliance,
 		 * we should return an error in this case.  On the other hand,
 		 * if we simply miss the standard for the link type or a new
 		 * standard is defined for a new link type, the IFID length
 		 * is very likely to be the common constant.  As a compromise,
 		 * we always use the constant, but make an explicit notice
 		 * indicating the "unknown" case.
 		 */
 		printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type);
 		return (64);
 	}
 }
 
 #include <sys/sysctl.h>
 
 struct in6_llentry {
 	struct llentry		base;
 };
 
 #define	IN6_LLTBL_DEFAULT_HSIZE	32
 #define	IN6_LLTBL_HASH(k, h) \
 	(((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1))
 
 /*
  * Do actual deallocation of @lle.
  * Called by LLE_FREE_LOCKED when number of references
  * drops to zero.
  */
 static void
 in6_lltable_destroy_lle(struct llentry *lle)
 {
 
 	LLE_WUNLOCK(lle);
 	LLE_LOCK_DESTROY(lle);
 	LLE_REQ_DESTROY(lle);
 	free(lle, M_LLTABLE);
 }
 
 static struct llentry *
 in6_lltable_new(const struct in6_addr *addr6, u_int flags)
 {
 	struct in6_llentry *lle;
 
 	lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO);
 	if (lle == NULL)		/* NB: caller generates msg */
 		return NULL;
 
 	lle->base.r_l3addr.addr6 = *addr6;
 	lle->base.lle_refcnt = 1;
 	lle->base.lle_free = in6_lltable_destroy_lle;
 	LLE_LOCK_INIT(&lle->base);
 	LLE_REQ_INIT(&lle->base);
 	callout_init(&lle->base.lle_timer, 1);
 
 	return (&lle->base);
 }
 
 static int
 in6_lltable_match_prefix(const struct sockaddr *saddr,
     const struct sockaddr *smask, u_int flags, struct llentry *lle)
 {
 	const struct in6_addr *addr, *mask, *lle_addr;
 
 	addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr;
 	mask = &((const struct sockaddr_in6 *)smask)->sin6_addr;
 	lle_addr = &lle->r_l3addr.addr6;
 
 	if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0)
 		return (0);
 
 	if (lle->la_flags & LLE_IFADDR) {
 
 		/*
 		 * Delete LLE_IFADDR records IFF address & flag matches.
 		 * Note that addr is the interface address within prefix
 		 * being matched.
 		 */
 		if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) &&
 		    (flags & LLE_STATIC) != 0)
 			return (1);
 		return (0);
 	}
 
 	/* flags & LLE_STATIC means deleting both dynamic and static entries */
 	if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))
 		return (1);
 
 	return (0);
 }
 
 static void
 in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 	struct ifnet *ifp;
 
 	LLE_WLOCK_ASSERT(lle);
 	KASSERT(llt != NULL, ("lltable is NULL"));
 
 	/* Unlink entry from table */
 	if ((lle->la_flags & LLE_LINKED) != 0) {
 
 		ifp = llt->llt_ifp;
 		IF_AFDATA_WLOCK_ASSERT(ifp);
 		lltable_unlink_entry(llt, lle);
 	}
 
 	if (callout_stop(&lle->lle_timer) > 0)
 		LLE_REMREF(lle);
 
 	llentry_free(lle);
 }
 
 static int
 in6_lltable_rtcheck(struct ifnet *ifp,
 		    u_int flags,
 		    const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6;
 	struct nhop6_basic nh6;
 	struct in6_addr dst;
 	uint32_t scopeid;
 	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/* Our local addresses are always only installed on the default FIB. */
 
 	sin6 = (const struct sockaddr_in6 *)l3addr;
 	in6_splitscope(&sin6->sin6_addr, &dst, &scopeid);
 	error = fib6_lookup_nh_basic(RT_DEFAULT_FIB, &dst, scopeid, 0, 0, &nh6);
 	if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) {
 		struct ifaddr *ifa;
 		/*
 		 * Create an ND6 cache for an IPv6 neighbor
 		 * that is not covered by our own prefix.
 		 */
 		ifa = ifaof_ifpforaddr(l3addr, ifp);
 		if (ifa != NULL) {
 			ifa_free(ifa);
 			return 0;
 		}
 		log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
 		    ip6_sprintf(ip6buf, &sin6->sin6_addr));
 		return EINVAL;
 	}
 	return 0;
 }
 
 static inline uint32_t
 in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
 {
 
 	return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize));
 }
 
 static uint32_t
 in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
 {
 
 	return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize));
 }
 
 static void
 in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 	struct sockaddr_in6 *sin6;
 
 	sin6 = (struct sockaddr_in6 *)sa;
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 	sin6->sin6_addr = lle->r_l3addr.addr6;
 }
 
 static inline struct llentry *
 in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
 {
 	struct llentry *lle;
 	struct llentries *lleh;
 	u_int hashidx;
 
 	hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize);
 	lleh = &llt->lle_head[hashidx];
 	LIST_FOREACH(lle, lleh, lle_next) {
 		if (lle->la_flags & LLE_DELETED)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst))
 			break;
 	}
 
 	return (lle);
 }
 
 static void
 in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	lle->la_flags |= LLE_DELETED;
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 #ifdef DIAGNOSTIC
 	log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
 #endif
 	llentry_free(lle);
 }
 
 static struct llentry *
 in6_lltable_alloc(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
 	struct ifnet *ifp = llt->llt_ifp;
 	struct llentry *lle;
 	char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/*
 	 * A route that covers the given address must have
 	 * been installed 1st because we are doing a resolution,
 	 * verify this.
 	 */
 	if (!(flags & LLE_IFADDR) &&
 	    in6_lltable_rtcheck(ifp, flags, l3addr) != 0)
 		return (NULL);
 
 	lle = in6_lltable_new(&sin6->sin6_addr, flags);
 	if (lle == NULL) {
 		log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
 		return (NULL);
 	}
 	lle->la_flags = flags;
 	if ((flags & LLE_IFADDR) == LLE_IFADDR) {
 		linkhdrsize = LLE_MAX_LINKHDR;
 		if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp),
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0)
 			return (NULL);
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		lle->la_flags |= LLE_STATIC;
 	}
 
 	if ((lle->la_flags & LLE_STATIC) != 0)
 		lle->ln_state = ND6_LLINFO_REACHABLE;
 
 	return (lle);
 }
 
 static struct llentry *
 in6_lltable_lookup(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
 	struct llentry *lle;
 
 	IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
 
 	if (lle == NULL)
 		return (NULL);
 
 	KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) !=
 	    (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X",
 	    flags));
 
 	if (flags & LLE_UNLOCKED)
 		return (lle);
 
 	if (flags & LLE_EXCLUSIVE)
 		LLE_WLOCK(lle);
 	else
 		LLE_RLOCK(lle);
 	return (lle);
 }
 
 static int
 in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
     struct sysctl_req *wr)
 {
 	struct ifnet *ifp = llt->llt_ifp;
 	/* XXX stack use */
 	struct {
 		struct rt_msghdr	rtm;
 		struct sockaddr_in6	sin6;
 		/*
 		 * ndp.c assumes that sdl is word aligned
 		 */
 #ifdef __LP64__
 		uint32_t		pad;
 #endif
 		struct sockaddr_dl	sdl;
 	} ndpc;
 	struct sockaddr_dl *sdl;
 	int error;
 
 	bzero(&ndpc, sizeof(ndpc));
 			/* skip deleted entries */
 			if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
 				return (0);
 			/* Skip if jailed and not a valid IP of the prison. */
 			lltable_fill_sa_entry(lle,
 			    (struct sockaddr *)&ndpc.sin6);
 			if (prison_if(wr->td->td_ucred,
 			    (struct sockaddr *)&ndpc.sin6) != 0)
 				return (0);
 			/*
 			 * produce a msg made of:
 			 *  struct rt_msghdr;
 			 *  struct sockaddr_in6 (IPv6)
 			 *  struct sockaddr_dl;
 			 */
 			ndpc.rtm.rtm_msglen = sizeof(ndpc);
 			ndpc.rtm.rtm_version = RTM_VERSION;
 			ndpc.rtm.rtm_type = RTM_GET;
 			ndpc.rtm.rtm_flags = RTF_UP;
 			ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
 			if (V_deembed_scopeid)
 				sa6_recoverscope(&ndpc.sin6);
 
 			/* publish */
 			if (lle->la_flags & LLE_PUB)
 				ndpc.rtm.rtm_flags |= RTF_ANNOUNCE;
 
 			sdl = &ndpc.sdl;
 			sdl->sdl_family = AF_LINK;
 			sdl->sdl_len = sizeof(*sdl);
 			sdl->sdl_alen = ifp->if_addrlen;
 			sdl->sdl_index = ifp->if_index;
 			sdl->sdl_type = ifp->if_type;
 			bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
 			if (lle->la_expire != 0)
 				ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire +
 				    lle->lle_remtime / hz +
 				    time_second - time_uptime;
 			ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
 			if (lle->la_flags & LLE_STATIC)
 				ndpc.rtm.rtm_flags |= RTF_STATIC;
 			if (lle->la_flags & LLE_IFADDR)
 				ndpc.rtm.rtm_flags |= RTF_PINNED;
 			if (lle->ln_router != 0)
 				ndpc.rtm.rtm_flags |= RTF_GATEWAY;
 			ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked;
 			/* Store state in rmx_weight value */
 			ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state;
 			ndpc.rtm.rtm_index = ifp->if_index;
 			error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc));
 
 	return (error);
 }
 
 static struct lltable *
 in6_lltattach(struct ifnet *ifp)
 {
 	struct lltable *llt;
 
 	llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
 	llt->llt_af = AF_INET6;
 	llt->llt_ifp = ifp;
 
 	llt->llt_lookup = in6_lltable_lookup;
 	llt->llt_alloc_entry = in6_lltable_alloc;
 	llt->llt_delete_entry = in6_lltable_delete_entry;
 	llt->llt_dump_entry = in6_lltable_dump_entry;
 	llt->llt_hash = in6_lltable_hash;
 	llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
 	llt->llt_free_entry = in6_lltable_free_entry;
 	llt->llt_match_prefix = in6_lltable_match_prefix;
  	lltable_link(llt);
 
 	return (llt);
 }
 
 void *
 in6_domifattach(struct ifnet *ifp)
 {
 	struct in6_ifextra *ext;
 
 	/* There are not IPv6-capable interfaces. */
 	switch (ifp->if_type) {
 	case IFT_PFLOG:
 	case IFT_PFSYNC:
 	case IFT_USB:
 		return (NULL);
 	}
 	ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK);
 	bzero(ext, sizeof(*ext));
 
 	ext->in6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK);
 
 	ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR,
 	    M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK);
 
 	ext->nd_ifinfo = nd6_ifattach(ifp);
 	ext->scope6_id = scope6_ifattach(ifp);
 	ext->lltable = in6_lltattach(ifp);
 
 	ext->mld_ifinfo = mld_domifattach(ifp);
 
 	return ext;
 }
 
 int
 in6_domifmtu(struct ifnet *ifp)
 {
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return ifp->if_mtu;
 
 	return (IN6_LINKMTU(ifp));
 }
 
 void
 in6_domifdetach(struct ifnet *ifp, void *aux)
 {
 	struct in6_ifextra *ext = (struct in6_ifextra *)aux;
 
 	mld_domifdetach(ifp);
 	scope6_ifdetach(ext->scope6_id);
-	nd6_ifdetach(ext->nd_ifinfo);
+	nd6_ifdetach(ifp, ext->nd_ifinfo);
 	lltable_free(ext->lltable);
 	COUNTER_ARRAY_FREE(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 	free(ext->in6_ifstat, M_IFADDR);
 	COUNTER_ARRAY_FREE(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 	free(ext->icmp6_ifstat, M_IFADDR);
 	free(ext, M_IFADDR);
 }
 
 /*
  * Convert sockaddr_in6 to sockaddr_in.  Original sockaddr_in6 must be
  * v4 mapped addr or v4 compat addr
  */
 void
 in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {
 
 	bzero(sin, sizeof(*sin));
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_family = AF_INET;
 	sin->sin_port = sin6->sin6_port;
 	sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
 }
 
 /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
 void
 in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_port = sin->sin_port;
 	sin6->sin6_addr.s6_addr32[0] = 0;
 	sin6->sin6_addr.s6_addr32[1] = 0;
 	sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 	sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr;
 }
 
 /* Convert sockaddr_in6 into sockaddr_in. */
 void
 in6_sin6_2_sin_in_sock(struct sockaddr *nam)
 {
 	struct sockaddr_in *sin_p;
 	struct sockaddr_in6 sin6;
 
 	/*
 	 * Save original sockaddr_in6 addr and convert it
 	 * to sockaddr_in.
 	 */
 	sin6 = *(struct sockaddr_in6 *)nam;
 	sin_p = (struct sockaddr_in *)nam;
 	in6_sin6_2_sin(sin_p, &sin6);
 }
 
 /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */
 void
 in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
 {
 	struct sockaddr_in *sin_p;
 	struct sockaddr_in6 *sin6_p;
 
 	sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK);
 	sin_p = (struct sockaddr_in *)*nam;
 	in6_sin_2_v4mapsin6(sin_p, sin6_p);
 	free(*nam, M_SONAME);
 	*nam = (struct sockaddr *)sin6_p;
 }
Index: projects/vnet/sys/netinet6/ip6_input.c
===================================================================
--- projects/vnet/sys/netinet6/ip6_input.c	(revision 299172)
+++ projects/vnet/sys/netinet6/ip6_input.c	(revision 299173)
@@ -1,1735 +1,1739 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_systm.h>
 #include <net/if_llatbl.h>
 #ifdef INET
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #endif /* INET */
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netinet6/ip6_ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <netinet6/ip6protosw.h>
 
 extern struct domain inet6domain;
 
 u_char ip6_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead);
 VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
 VNET_DEFINE(u_long, in6_ifaddrhmask);
 
 static struct netisr_handler ip6_nh = {
 	.nh_name = "ip6",
 	.nh_handler = ip6_input,
 	.nh_proto = NETISR_IPV6,
 #ifdef RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef RSS
 static struct netisr_handler ip6_direct_nh = {
 	.nh_name = "ip6_direct",
 	.nh_handler = ip6_direct_input,
 	.nh_proto = NETISR_IPV6_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 VNET_DEFINE(struct pfil_head, inet6_pfil_hook);
 
 VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat);
 VNET_PCPUSTAT_SYSINIT(ip6stat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ip6stat);
 #endif /* VIMAGE */
 
 struct rmlock in6_ifaddr_lock;
 RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock");
 
 static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
 #ifdef PULLDOWN_TEST
 static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
 #endif
 
 /*
  * IP6 initialization: fill in IP6 protocol switch table.
  * All protocols not implemented in kernel go to raw IP6 protocol handler.
  */
 void
 ip6_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal",
 	    &V_ip6_auto_linklocal);
 	TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv);
 	TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr);
 
 	TAILQ_INIT(&V_in6_ifaddrhead);
 	V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR,
 	    &V_in6_ifaddrhmask);
 
 	/* Initialize packet filter hooks. */
 	V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet6_pfil_hook.ph_af = AF_INET6;
 	if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	scope6_init();
 	addrsel_policy_init();
 	nd6_init();
 	frag6_init();
 
 	V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
 
 	/* Skip global initialization stuff for non-default instances. */
-	if (!IS_DEFAULT_VNET(curvnet))
-		goto out;
+	if (!IS_DEFAULT_VNET(curvnet)) {
+		netisr_register_vnet(&ip6_nh);
+#ifdef RSS
+		netisr_register_vnet(&ip6_direct_nh);
+#endif
+		return;
+	}
 
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip6_init");
 
 	/* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip6_protox[i] = pr - inet6sw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip6_protox[].
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip6_protox[pr->pr_protocol] = pr - inet6sw;
 		}
 
-out:
 	netisr_register(&ip6_nh);
 #ifdef RSS
 	netisr_register(&ip6_direct_nh);
 #endif
 }
 
 /*
  * The protocol to be inserted into ip6_protox[] must be already registered
  * in inet6sw[], either statically or through pf_proto_register().
  */
 int
 ip6proto_register(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] != pr - inet6sw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/*
 	 * Find the protocol position in inet6sw[] and set the index.
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol == ip6proto) {
 			ip6_protox[pr->pr_protocol] = pr - inet6sw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ip6proto_unregister(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] == pr - inet6sw)	/* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip6_protox[ip6proto] = pr - inet6sw;
 	return (0);
 }
 
 #ifdef VIMAGE
 static void
 ip6_destroy(void *unused __unused)
 {
 	struct ifaddr *ifa, *nifa;
 	struct ifnet *ifp;
 	int error;
 
 #ifdef RSS
-	netisr_unregister(&ip6_direct_nh);
+	netisr_unregister_vnet(&ip6_direct_nh);
 #endif
-	netisr_unregister(&ip6_nh);
+	netisr_unregister_vnet(&ip6_nh);
 
 	if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Cleanup addresses. */
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Cannot lock here - lock recursion. */
 		/* IF_ADDR_LOCK(ifp); */
 		TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
 
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			in6_purgeaddr(ifa);
 		}
 		/* IF_ADDR_UNLOCK(ifp); */
 		in6_ifdetach(ifp, 0);
 		mld_domifdetach(ifp);
 		/* Make sure any routes are gone as well. */
 		rt_flushifroutes_af(ifp, AF_INET6);
 	}
 	IFNET_RUNLOCK();
 
 	nd6_destroy();
 	in6_ifattach_destroy();
 
 	hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL);
 #endif
 
 static int
 ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off,
     int *nxt, int *ours)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 
 	if (ip6_hopopts_input(plen, rtalert, &m, off)) {
 #if 0	/*touches NULL pointer*/
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 #endif
 		goto out;	/* m have already been freed */
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * if the payload length field is 0 and the next header field
 	 * indicates Hop-by-Hop Options header, then a Jumbo Payload
 	 * option MUST be included.
 	 */
 	if (ip6->ip6_plen == 0 && *plen == 0) {
 		/*
 		 * Note that if a valid jumbo payload option is
 		 * contained, ip6_hopopts_input() must set a valid
 		 * (non-zero) payload length to the variable plen.
 		 */
 		IP6STAT_INC(ip6s_badoptions);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_HEADER,
 			    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
 		goto out;
 	}
 #ifndef PULLDOWN_TEST
 	/* ip6_hopopts_input() ensures that mbuf is contiguous */
 	hbh = (struct ip6_hbh *)(ip6 + 1);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		goto out;
 	}
 #endif
 	*nxt = hbh->ip6h_nxt;
 
 	/*
 	 * If we are acting as a router and the packet contains a
 	 * router alert option, see if we know the option value.
 	 * Currently, we only support the option value for MLD, in which
 	 * case we should pass the packet to the multicast routing
 	 * daemon.
 	 */
 	if (*rtalert != ~0) {
 		switch (*rtalert) {
 		case IP6OPT_RTALERT_MLD:
 			if (V_ip6_forwarding)
 				*ours = 1;
 			break;
 		default:
 			/*
 			 * RFC2711 requires unrecognized values must be
 			 * silently ignored.
 			 */
 			break;
 		}
 	}
 
 	return (0);
 
 out:
 	return (1);
 }
 
 #ifdef RSS
 /*
  * IPv6 direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip6_direct_input(struct mbuf *m)
 {
 	int off, nxt;
 	int nest;
 	struct m_tag *mtag;
 	struct ip6_direct_ctx *ip6dc;
 
 	mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL);
 	KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!"));
 
 	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
 	nxt = ip6dc->ip6dc_nxt;
 	off = ip6dc->ip6dc_off;
 
 	nest = 0;
 
 	m_tag_delete(m, mtag);
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #ifdef IPSEC
 		/*
 		 * enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ip6_ipsec_input(m, nxt))
 			goto bad;
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	m_freem(m);
 }
 #endif
 
 void
 ip6_input(struct mbuf *m)
 {
 	struct in6_addr odst;
 	struct ip6_hdr *ip6;
 	struct in6_ifaddr *ia;
 	u_int32_t plen;
 	u_int32_t rtalert = ~0;
 	int off = sizeof(struct ip6_hdr), nest;
 	int nxt, ours = 0;
 	int srcrt = 0;
 
 #ifdef IPSEC
 	/*
 	 * should the inner packet be considered authentic?
 	 * see comment in ah4_input().
 	 * NB: m cannot be NULL when passed to the input routine
 	 */
 
 	m->m_flags &= ~M_AUTHIPHDR;
 	m->m_flags &= ~M_AUTHIPDGM;
 
 #endif /* IPSEC */
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * Firewall changed destination to local.
 		 */
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		ip6 = mtod(m, struct ip6_hdr *);
 		goto hbhcheck;
 	}
 
 	/*
 	 * mbuf statistics
 	 */
 	if (m->m_flags & M_EXT) {
 		if (m->m_next)
 			IP6STAT_INC(ip6s_mext2m);
 		else
 			IP6STAT_INC(ip6s_mext1);
 	} else {
 		if (m->m_next) {
 			if (m->m_flags & M_LOOP) {
 				IP6STAT_INC(ip6s_m2m[V_loif->if_index]);
 			} else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX)
 				IP6STAT_INC(
 				    ip6s_m2m[m->m_pkthdr.rcvif->if_index]);
 			else
 				IP6STAT_INC(ip6s_m2m[0]);
 		} else
 			IP6STAT_INC(ip6s_m1);
 	}
 
 	/* drop the packet if IPv6 operation is disabled on the IF */
 	if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED))
 		goto bad;
 
 	in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive);
 	IP6STAT_INC(ip6s_total);
 
 #ifndef PULLDOWN_TEST
 	/*
 	 * L2 bridge code and some other code can return mbuf chain
 	 * that does not conform to KAME requirement.  too bad.
 	 * XXX: fails to join if interface MTU > MCLBYTES.  jumbogram?
 	 */
 	if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) {
 		struct mbuf *n;
 
 		if (m->m_pkthdr.len > MHLEN)
 			n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 			n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;	/* ENOBUFS */
 		}
 
 		m_move_pkthdr(n, m);
 		m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t));
 		n->m_len = n->m_pkthdr.len;
 		m_freem(m);
 		m = n;
 	}
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */);
 #endif
 
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		struct ifnet *inifp;
 		inifp = m->m_pkthdr.rcvif;
 		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			return;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]);
 
 	IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6);
 
 	/*
 	 * Check against address spoofing/corruption.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
 		/*
 		 * XXX: "badscope" is not very suitable for a multicast source.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
 	    !(m->m_flags & M_LOOP)) {
 		/*
 		 * In this case, the packet should come from the loopback
 		 * interface.  However, we cannot just check the if_flags,
 		 * because ip6_mloopback() passes the "actual" interface
 		 * as the outgoing/incoming interface.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 	    IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) {
 		/*
 		 * RFC4291 2.7:
 		 * Nodes must not originate a packet to a multicast address
 		 * whose scop field contains the reserved value 0; if such
 		 * a packet is received, it must be silently dropped.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) {
 		/* packet is dropped by traffic conditioner */
 		return;
 	}
 #endif
 	/*
 	 * The following check is not documented in specs.  A malicious
 	 * party may be able to use IPv4 mapped addr to confuse tcp/udp stack
 	 * and bypass security checks (act as if it was from 127.0.0.1 by using
 	 * IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * This check chokes if we are in an SIIT cloud.  As none of BSDs
 	 * support IPv4-less kernel compilation, we cannot support SIIT
 	 * environment at all.  So, it makes more sense for us to reject any
 	 * malicious packets for non-SIIT environment, than try to do a
 	 * partial support for SIIT environment.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #if 0
 	/*
 	 * Reject packets with IPv4 compatible addresses (auto tunnel).
 	 *
 	 * The code forbids auto tunnel relay case in RFC1933 (the check is
 	 * stronger than RFC1933).  We may want to re-enable it if mech-xx
 	 * is revised to forbid relaying case.
 	 */
 	if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #endif
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip6_ipsec_filtertunnel(m))
 		goto passin;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing
 	 *     (e.g. by NAT rewriting).  When this happens,
 	 *     tell ip6_forward to do the right thing.
 	 */
 	odst = ip6->ip6_dst;
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passin;
 
 	if (pfil_run_hooks(&V_inet6_pfil_hook, &m,
 	    m->m_pkthdr.rcvif, PFIL_IN, NULL))
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 	ip6 = mtod(m, struct ip6_hdr *);
 	srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		goto hbhcheck;
 	}
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 		/*
 		 * Directly ship the packet on.  This allows forwarding
 		 * packets originally destined to us to some other directly
 		 * connected host.
 		 */
 		ip6_forward(m, 1);
 		return;
 	}
 
 passin:
 	/*
 	 * Disambiguate address scope zones (if there is ambiguity).
 	 * We first make sure that the original source or destination address
 	 * is not in our internal form for scoped addresses.  Such addresses
 	 * are not necessarily invalid spec-wise, but we cannot accept them due
 	 * to the usage conflict.
 	 * in6_setscope() then also checks and rejects the cases where src or
 	 * dst are the loopback address and the receiving interface
 	 * is not loopback.
 	 */
 	if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope); /* XXX */
 		goto bad;
 	}
 	if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) ||
 	    in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) {
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	/*
 	 * Multicast check. Assume packet is for us to avoid
 	 * prematurely taking locks.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ours = 1;
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast);
 		goto hbhcheck;
 	}
 	/*
 	 * Unicast check
 	 * XXX: For now we keep link-local IPv6 addresses with embedded
 	 *      scope zone id, therefore we use zero zoneid here.
 	 */
 	ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 	if (ia != NULL) {
 		if (ia->ia6_flags & IN6_IFF_NOTREADY) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 			/* address is not ready, so discard the packet. */
 			nd6log((LOG_INFO,
 			    "ip6_input: packet to an unready address %s->%s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
 			ifa_free(&ia->ia_ifa);
 			goto bad;
 		}
 		/* Count the packet in the ip address stats */
 		counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 		counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len);
 		ifa_free(&ia->ia_ifa);
 		ours = 1;
 		goto hbhcheck;
 	}
 
 	/*
 	 * Now there is no reason to process the packet if it's not our own
 	 * and we're not a router.
 	 */
 	if (!V_ip6_forwarding) {
 		IP6STAT_INC(ip6s_cantforward);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		goto bad;
 	}
 
   hbhcheck:
 	/*
 	 * Process Hop-by-Hop options header if it's contained.
 	 * m may be modified in ip6_hopopts_input().
 	 * If a JumboPayload option is included, plen will also be modified.
 	 */
 	plen = (u_int32_t)ntohs(ip6->ip6_plen);
 	if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 		if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0)
 			return;
 	} else
 		nxt = ip6->ip6_nxt;
 
 	/*
 	 * Use mbuf flags to propagate Router Alert option to
 	 * ICMPv6 layer, as hop-by-hop options have been stripped.
 	 */
 	if (rtalert != ~0)
 		m->m_flags |= M_RTALERT_MLD;
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IPv6 header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
 		IP6STAT_INC(ip6s_tooshort);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = sizeof(struct ip6_hdr) + plen;
 			m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 		} else
 			m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Forward if desirable.
 	 */
 	if (V_ip6_mrouter &&
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		/*
 		 * If we are acting as a multicast router, all
 		 * incoming multicast packets are passed to the
 		 * kernel-level multicast forwarding function.
 		 * The packet is returned (relatively) intact; if
 		 * ip6_mforward() returns a non-zero value, the packet
 		 * must be discarded, else it may be accepted below.
 		 *
 		 * XXX TODO: Check hlim and multicast scope here to avoid
 		 * unnecessarily calling into ip6_mforward().
 		 */
 		if (ip6_mforward &&
 		    ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) {
 			IP6STAT_INC(ip6s_cantforward);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto bad;
 		}
 	} else if (!ours) {
 		ip6_forward(m, srcrt);
 		return;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Malicious party may be able to use IPv4 mapped addr to confuse
 	 * tcp/udp stack and bypass security checks (act as if it was from
 	 * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * For SIIT end node behavior, you may want to disable the check.
 	 * However, you will  become vulnerable to attacks using IPv4 mapped
 	 * source.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 
 	/*
 	 * Tell launch routine the next header
 	 */
 	IP6STAT_INC(ip6s_delivered);
 	in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver);
 	nest = 0;
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #ifdef IPSEC
 		/*
 		 * enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ip6_ipsec_input(m, nxt))
 			goto bad;
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * Hop-by-Hop options header processing. If a valid jumbo payload option is
  * included, the real payload length will be stored in plenp.
  *
  * rtalertp - XXX: should be stored more smart way
  */
 static int
 ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
     struct mbuf **mp, int *offp)
 {
 	struct mbuf *m = *mp;
 	int off = *offp, hbhlen;
 	struct ip6_hbh *hbh;
 
 	/* validation of the length of the header */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 
 	IP6_EXTHDR_CHECK(m, off, hbhlen, -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
 		sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		hbhlen);
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 #endif
 	off += hbhlen;
 	hbhlen -= sizeof(struct ip6_hbh);
 	if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
 				hbhlen, rtalertp, plenp) < 0)
 		return (-1);
 
 	*offp = off;
 	*mp = m;
 	return (0);
 }
 
 /*
  * Search header for all Hop-by-hop options and process each option.
  * This function is separate from ip6_hopopts_input() in order to
  * handle a case where the sending node itself process its hop-by-hop
  * options header. In such a case, the function is called from ip6_output().
  *
  * The function assumes that hbh header is located right after the IPv6 header
  * (RFC2460 p7), opthead is pointer into data content in m, and opthead to
  * opthead + hbhlen is located in contiguous memory region.
  */
 int
 ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
     u_int32_t *rtalertp, u_int32_t *plenp)
 {
 	struct ip6_hdr *ip6;
 	int optlen = 0;
 	u_int8_t *opt = opthead;
 	u_int16_t rtalert_val;
 	u_int32_t jumboplen;
 	const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
 
 	for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
 		switch (*opt) {
 		case IP6OPT_PAD1:
 			optlen = 1;
 			break;
 		case IP6OPT_PADN:
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = *(opt + 1) + 2;
 			break;
 		case IP6OPT_ROUTER_ALERT:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_RTALERT_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_RTALERT_LEN;
 			bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2);
 			*rtalertp = ntohs(rtalert_val);
 			break;
 		case IP6OPT_JUMBO:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_JUMBO_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_JUMBO_LEN;
 
 			/*
 			 * IPv6 packets that have non 0 payload length
 			 * must not contain a jumbo payload option.
 			 */
 			ip6 = mtod(m, struct ip6_hdr *);
 			if (ip6->ip6_plen) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt - opthead);
 				return (-1);
 			}
 
 			/*
 			 * We may see jumbolen in unaligned location, so
 			 * we'd need to perform bcopy().
 			 */
 			bcopy(opt + 2, &jumboplen, sizeof(jumboplen));
 			jumboplen = (u_int32_t)htonl(jumboplen);
 
 #if 1
 			/*
 			 * if there are multiple jumbo payload options,
 			 * *plenp will be non-zero and the packet will be
 			 * rejected.
 			 * the behavior may need some debate in ipngwg -
 			 * multiple options does not make sense, however,
 			 * there's no explicit mention in specification.
 			 */
 			if (*plenp != 0) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 #endif
 
 			/*
 			 * jumbo payload length must be larger than 65535.
 			 */
 			if (jumboplen <= IPV6_MAXPACKET) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 			*plenp = jumboplen;
 
 			break;
 		default:		/* unknown option */
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = ip6_unknown_opt(opt, m,
 			    erroff + opt - opthead);
 			if (optlen == -1)
 				return (-1);
 			optlen += 2;
 			break;
 		}
 	}
 
 	return (0);
 
   bad:
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Unknown option processing.
  * The third argument `off' is the offset from the IPv6 header to the option,
  * which is necessary if the IPv6 header the and option header and IPv6 header
  * is not contiguous in order to return an ICMPv6 error.
  */
 int
 ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6;
 
 	switch (IP6OPT_TYPE(*optp)) {
 	case IP6OPT_TYPE_SKIP: /* ignore the option */
 		return ((int)*(optp + 1));
 	case IP6OPT_TYPE_DISCARD:	/* silently discard */
 		m_freem(m);
 		return (-1);
 	case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    (m->m_flags & (M_BCAST|M_MCAST)))
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	}
 
 	m_freem(m);		/* XXX: NOTREACHED */
 	return (-1);
 }
 
 /*
  * Create the "control" list for this pcb.
  * These functions will not modify mbuf chain at all.
  *
  * With KAME mbuf chain restriction:
  * The routine will be called from upper layer handlers like tcp6_input().
  * Thus the routine assumes that the caller (tcp6_input) have already
  * called IP6_EXTHDR_CHECK() and all the extension headers are located in the
  * very first mbuf on the mbuf chain.
  *
  * ip6_savecontrol_v4 will handle those options that are possible to be
  * set on a v4-mapped socket.
  * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
  * options and handle the v6-only ones itself.
  */
 struct mbuf **
 ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
     int *v4only)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef SO_TIMESTAMP
 	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
 		struct timeval tv;
 
 		microtime(&tv);
 		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 
 #define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
 	/* RFC 2292 sec. 5 */
 	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
 		struct in6_pktinfo pi6;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			pi6.ipi6_addr.s6_addr32[0] = 0;
 			pi6.ipi6_addr.s6_addr32[1] = 0;
 			pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 			pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr;
 #else
 			/* We won't hit this code */
 			bzero(&pi6.ipi6_addr, sizeof(struct in6_addr));
 #endif
 		} else {	
 			bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
 			in6_clearscope(&pi6.ipi6_addr);	/* XXX */
 		}
 		pi6.ipi6_ifindex =
 		    (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0;
 
 		*mp = sbcreatecontrol((caddr_t) &pi6,
 		    sizeof(struct in6_pktinfo),
 		    IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
 		int hlim;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			hlim = ip->ip_ttl;
 #else
 			/* We won't hit this code */
 			hlim = 0;
 #endif
 		} else {
 			hlim = ip6->ip6_hlim & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
 		    IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
 		    IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_TCLASS) != 0) {
 		int tclass;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			tclass = ip->ip_tos;
 #else
 			/* We won't hit this code */
 			tclass = 0;
 #endif
 		} else {
 			u_int32_t flowinfo;
 
 			flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
 			flowinfo >>= 20;
 			tclass = flowinfo & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int),
 		    IPV6_TCLASS, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (v4only != NULL) {
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			*v4only = 1;
 		} else {
 			*v4only = 0;
 		}
 	}
 
 	return (mp);
 }
 
 void
 ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	int v4only = 0;
 
 	mp = ip6_savecontrol_v4(in6p, m, mp, &v4only);
 	if (v4only)
 		return;
 
 	/*
 	 * IPV6_HOPOPTS socket option.  Recall that we required super-user
 	 * privilege for the option (see ip6_ctloutput), but it might be too
 	 * strict, since there might be some hop-by-hop options which can be
 	 * returned to normal user.
 	 * See also RFC 2292 section 6 (or RFC 3542 section 8).
 	 */
 	if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) {
 		/*
 		 * Check if a hop-by-hop options header is contatined in the
 		 * received packet, and if so, store the options as ancillary
 		 * data. Note that a hop-by-hop options header must be
 		 * just after the IPv6 header, which is assured through the
 		 * IPv6 input processing.
 		 */
 		if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 			struct ip6_hbh *hbh;
 			int hbhlen = 0;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext;
 #endif
 
 #ifndef PULLDOWN_TEST
 			hbh = (struct ip6_hbh *)(ip6 + 1);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 #else
 			ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
 			    ip6->ip6_nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			hbh = mtod(ext, struct ip6_hbh *);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 			if (hbhlen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			/*
 			 * XXX: We copy the whole header even if a
 			 * jumbo payload option is included, the option which
 			 * is to be removed before returning according to
 			 * RFC2292.
 			 * Note: this constraint is removed in RFC3542
 			 */
 			*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
 			    IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
 			    IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 #endif
 		}
 	}
 
 	if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
 		int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
 
 		/*
 		 * Search for destination options headers or routing
 		 * header(s) through the header chain, and stores each
 		 * header as ancillary data.
 		 * Note that the order of the headers remains in
 		 * the chain of ancillary data.
 		 */
 		while (1) {	/* is explicit loop prevention necessary? */
 			struct ip6_ext *ip6e = NULL;
 			int elen;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext = NULL;
 #endif
 
 			/*
 			 * if it is not an extension header, don't try to
 			 * pull it from the chain.
 			 */
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_ROUTING:
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 			default:
 				goto loopend;
 			}
 
 #ifndef PULLDOWN_TEST
 			if (off + sizeof(*ip6e) > m->m_len)
 				goto loopend;
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (off + elen > m->m_len)
 				goto loopend;
 #else
 			ext = ip6_pullexthdr(m, off, nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			ip6e = mtod(ext, struct ip6_ext *);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (elen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 				if (!(in6p->inp_flags & IN6P_DSTOPTS))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p,
 					IPV6_2292DSTOPTS, IPV6_DSTOPTS),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_ROUTING:
 				if (!(in6p->inp_flags & IN6P_RTHDR))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 
 			default:
 				/*
 				 * other cases have been filtered in the above.
 				 * none will visit this case.  here we supply
 				 * the code just in case (nxt overwritten or
 				 * other cases).
 				 */
 #ifdef PULLDOWN_TEST
 				m_freem(ext);
 #endif
 				goto loopend;
 
 			}
 
 			/* proceed with the next header. */
 			off += elen;
 			nxt = ip6e->ip6e_nxt;
 			ip6e = NULL;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 			ext = NULL;
 #endif
 		}
 	  loopend:
 		;
 	}
 
 	if (in6p->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 
 }
 #undef IS2292
 
 void
 ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu)
 {
 	struct socket *so;
 	struct mbuf *m_mtu;
 	struct ip6_mtuinfo mtuctl;
 
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	/*
 	 * Notify the error by sending IPV6_PATHMTU ancillary data if
 	 * application wanted to know the MTU value.
 	 * NOTE: we notify disconnected sockets, because some udp
 	 * applications keep sending sockets disconnected.
 	 * NOTE: our implementation doesn't notify connected sockets that has
 	 * foreign address that is different than given destination addresses
 	 * (this is permitted by RFC 3542).
 	 */
 	if ((inp->inp_flags & IN6P_MTU) == 0 || (
 	    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr)))
 		return;
 
 	mtuctl.ip6m_mtu = mtu;
 	mtuctl.ip6m_addr = *dst;
 	if (sa6_recoverscope(&mtuctl.ip6m_addr))
 		return;
 
 	if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl),
 	    IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
 		return;
 
 	so =  inp->inp_socket;
 	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
 	    == 0) {
 		m_freem(m_mtu);
 		/* XXX: should count statistics */
 	} else
 		sorwakeup(so);
 }
 
 #ifdef PULLDOWN_TEST
 /*
  * pull single extension header from mbuf chain.  returns single mbuf that
  * contains the result, or NULL on error.
  */
 static struct mbuf *
 ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
 {
 	struct ip6_ext ip6e;
 	size_t elen;
 	struct mbuf *n;
 
 #ifdef DIAGNOSTIC
 	switch (nxt) {
 	case IPPROTO_DSTOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_AH: /* is it possible? */
 		break;
 	default:
 		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
 	}
 #endif
 
 	m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 	if (nxt == IPPROTO_AH)
 		elen = (ip6e.ip6e_len + 2) << 2;
 	else
 		elen = (ip6e.ip6e_len + 1) << 3;
 
 	if (elen > MLEN)
 		n = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL)
 		return NULL;
 
 	m_copydata(m, off, elen, mtod(n, caddr_t));
 	n->m_len = elen;
 	return n;
 }
 #endif
 
 /*
  * Get pointer to the previous header followed by the header
  * currently processed.
  * XXX: This function supposes that
  *	M includes all headers,
  *	the next header field and the header length field of each header
  *	are valid, and
  *	the sum of each header length equals to OFF.
  * Because of these assumptions, this function must be called very
  * carefully. Moreover, it will not be used in the near future when
  * we develop `neater' mechanism to process extension headers.
  */
 char *
 ip6_get_prevhdr(const struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 	if (off == sizeof(struct ip6_hdr))
 		return (&ip6->ip6_nxt);
 	else {
 		int len, nxt;
 		struct ip6_ext *ip6e = NULL;
 
 		nxt = ip6->ip6_nxt;
 		len = sizeof(struct ip6_hdr);
 		while (len < off) {
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len);
 
 			switch (nxt) {
 			case IPPROTO_FRAGMENT:
 				len += sizeof(struct ip6_frag);
 				break;
 			case IPPROTO_AH:
 				len += (ip6e->ip6e_len + 2) << 2;
 				break;
 			default:
 				len += (ip6e->ip6e_len + 1) << 3;
 				break;
 			}
 			nxt = ip6e->ip6e_nxt;
 		}
 		if (ip6e)
 			return (&ip6e->ip6e_nxt);
 		else
 			return NULL;
 	}
 }
 
 /*
  * get next header offset.  m will be retained.
  */
 int
 ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	struct ip6_hdr ip6;
 	struct ip6_ext ip6e;
 	struct ip6_frag fh;
 
 	/* just in case */
 	if (m == NULL)
 		panic("ip6_nexthdr: m == NULL");
 	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
 		return -1;
 
 	switch (proto) {
 	case IPPROTO_IPV6:
 		if (m->m_pkthdr.len < off + sizeof(ip6))
 			return -1;
 		m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
 		if (nxtp)
 			*nxtp = ip6.ip6_nxt;
 		off += sizeof(ip6);
 		return off;
 
 	case IPPROTO_FRAGMENT:
 		/*
 		 * terminate parsing if it is not the first fragment,
 		 * it does not make sense to parse through it.
 		 */
 		if (m->m_pkthdr.len < off + sizeof(fh))
 			return -1;
 		m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
 		/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
 		if (fh.ip6f_offlg & IP6F_OFF_MASK)
 			return -1;
 		if (nxtp)
 			*nxtp = fh.ip6f_nxt;
 		off += sizeof(struct ip6_frag);
 		return off;
 
 	case IPPROTO_AH:
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return -1;
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		if (nxtp)
 			*nxtp = ip6e.ip6e_nxt;
 		off += (ip6e.ip6e_len + 2) << 2;
 		return off;
 
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_DSTOPTS:
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return -1;
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		if (nxtp)
 			*nxtp = ip6e.ip6e_nxt;
 		off += (ip6e.ip6e_len + 1) << 3;
 		return off;
 
 	case IPPROTO_NONE:
 	case IPPROTO_ESP:
 	case IPPROTO_IPCOMP:
 		/* give up */
 		return -1;
 
 	default:
 		return -1;
 	}
 
 	/* NOTREACHED */
 }
 
 /*
  * get offset for the last header in the chain.  m will be kept untainted.
  */
 int
 ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	int newoff;
 	int nxt;
 
 	if (!nxtp) {
 		nxt = -1;
 		nxtp = &nxt;
 	}
 	while (1) {
 		newoff = ip6_nexthdr(m, off, proto, nxtp);
 		if (newoff < 0)
 			return off;
 		else if (newoff < off)
 			return -1;	/* invalid */
 		else if (newoff == off)
 			return newoff;
 
 		off = newoff;
 		proto = *nxtp;
 	}
 }
 
 /*
  * System control for IP6
  */
 
 u_char	inet6ctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		0,		0,
 	ENOPROTOOPT
 };
Index: projects/vnet/sys/netinet6/nd6.c
===================================================================
--- projects/vnet/sys/netinet6/nd6.c	(revision 299172)
+++ projects/vnet/sys/netinet6/nd6.c	(revision 299173)
@@ -1,2682 +1,2694 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/iso88025.h>
 #include <net/fddi.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/icmp6.h>
 #include <netinet6/send.h>
 
 #include <sys/limits.h>
 
 #include <security/mac/mac_framework.h>
 
 #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
 #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
 
 #define SIN6(s) ((const struct sockaddr_in6 *)(s))
 
 /* timer values */
 VNET_DEFINE(int, nd6_prune)	= 1;	/* walk list every 1 seconds */
 VNET_DEFINE(int, nd6_delay)	= 5;	/* delay first probe time 5 second */
 VNET_DEFINE(int, nd6_umaxtries)	= 3;	/* maximum unicast query */
 VNET_DEFINE(int, nd6_mmaxtries)	= 3;	/* maximum multicast query */
 VNET_DEFINE(int, nd6_useloopback) = 1;	/* use loopback interface for
 					 * local traffic */
 VNET_DEFINE(int, nd6_gctimer)	= (60 * 60 * 24); /* 1 day: garbage
 					 * collection timer */
 
 /* preventing too many loops in ND option parsing */
 static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
 
 VNET_DEFINE(int, nd6_maxnudhint) = 0;	/* max # of subsequent upper
 					 * layer hints */
 static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved
 					 * ND entries */
 #define	V_nd6_maxndopt			VNET(nd6_maxndopt)
 #define	V_nd6_maxqueuelen		VNET(nd6_maxqueuelen)
 
 #ifdef ND6_DEBUG
 VNET_DEFINE(int, nd6_debug) = 1;
 #else
 VNET_DEFINE(int, nd6_debug) = 0;
 #endif
 
 static eventhandler_tag lle_event_eh, iflladdr_event_eh;
 
 VNET_DEFINE(struct nd_drhead, nd_defrouter);
 VNET_DEFINE(struct nd_prhead, nd_prefix);
 VNET_DEFINE(struct rwlock, nd6_lock);
 
 VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 int	(*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
 
 static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *,
 	struct ifnet *);
 static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
 static void nd6_slowtimo(void *);
 static int regen_tmpaddr(struct in6_ifaddr *);
 static void nd6_free(struct llentry **, int);
 static void nd6_free_redirect(const struct llentry *);
 static void nd6_llinfo_timer(void *);
 static void nd6_llinfo_settimer_locked(struct llentry *, long);
 static void clear_llinfo_pqueue(struct llentry *);
 static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *,
     const struct sockaddr_in6 *, u_char *, uint32_t *);
 static int nd6_need_cache(struct ifnet *);
  
 
 static VNET_DEFINE(struct callout, nd6_slowtimo_ch);
 #define	V_nd6_slowtimo_ch		VNET(nd6_slowtimo_ch)
 
 VNET_DEFINE(struct callout, nd6_timer_ch);
 #define	V_nd6_timer_ch			VNET(nd6_timer_ch)
 
 static void
 nd6_lle_event(void *arg __unused, struct llentry *lle, int evt)
 {
 	struct rt_addrinfo rtinfo;
 	struct sockaddr_in6 dst;
 	struct sockaddr_dl gw;
 	struct ifnet *ifp;
 	int type;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	if (lltable_get_af(lle->lle_tbl) != AF_INET6)
 		return;
 
 	switch (evt) {
 	case LLENTRY_RESOLVED:
 		type = RTM_ADD;
 		KASSERT(lle->la_flags & LLE_VALID,
 		    ("%s: %p resolved but not valid?", __func__, lle));
 		break;
 	case LLENTRY_EXPIRED:
 		type = RTM_DELETE;
 		break;
 	default:
 		return;
 	}
 
 	ifp = lltable_get_ifp(lle->lle_tbl);
 
 	bzero(&dst, sizeof(dst));
 	bzero(&gw, sizeof(gw));
 	bzero(&rtinfo, sizeof(rtinfo));
 	lltable_fill_sa_entry(lle, (struct sockaddr *)&dst);
 	dst.sin6_scope_id = in6_getscopezone(ifp,
 	    in6_addrscope(&dst.sin6_addr));
 	gw.sdl_len = sizeof(struct sockaddr_dl);
 	gw.sdl_family = AF_LINK;
 	gw.sdl_alen = ifp->if_addrlen;
 	gw.sdl_index = ifp->if_index;
 	gw.sdl_type = ifp->if_type;
 	if (evt == LLENTRY_RESOLVED)
 		bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen);
 	rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst;
 	rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw;
 	rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY;
 	rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | (
 	    type == RTM_ADD ? RTF_UP: 0), 0, RT_DEFAULT_FIB);
 }
 
 /*
  * A handler for interface link layer address change event.
  */
 static void
 nd6_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 
 	lltable_update_ifaddr(LLTABLE6(ifp));
 }
 
 void
 nd6_init(void)
 {
 
 	rw_init(&V_nd6_lock, "nd6");
 
 	LIST_INIT(&V_nd_prefix);
 
 	/* initialization of the default router list */
 	TAILQ_INIT(&V_nd_defrouter);
 
 	/* Start timers. */
 	callout_init(&V_nd6_slowtimo_ch, 0);
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 
 	callout_init(&V_nd6_timer_ch, 0);
 	callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet);
 
 	nd6_dad_init();
 	if (IS_DEFAULT_VNET(curvnet)) {
 		lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event,
 		    NULL, EVENTHANDLER_PRI_ANY);
 		iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event,
 		    nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 	}
 }
 
 #ifdef VIMAGE
 void
 nd6_destroy()
 {
 
 	callout_drain(&V_nd6_slowtimo_ch);
 	callout_drain(&V_nd6_timer_ch);
 	if (IS_DEFAULT_VNET(curvnet)) {
 		EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh);
 	}
 	rw_destroy(&V_nd6_lock);
 }
 #endif
 
 struct nd_ifinfo *
 nd6_ifattach(struct ifnet *ifp)
 {
 	struct nd_ifinfo *nd;
 
 	nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO);
 	nd->initialized = 1;
 
 	nd->chlim = IPV6_DEFHLIM;
 	nd->basereachable = REACHABLE_TIME;
 	nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
 	nd->retrans = RETRANS_TIMER;
 
 	nd->flags = ND6_IFF_PERFORMNUD;
 
 	/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
 	 * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_auto_linklocal configuration to
 	 * give a reasonable default behavior.
 	 */
 	if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) ||
 	    (ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
 	/*
 	 * A loopback interface does not need to accept RTADV.
 	 * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_accept_rtadv configuration to
 	 * prevent the interface from accepting RA messages arrived
 	 * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (V_ip6_accept_rtadv &&
 	    !(ifp->if_flags & IFF_LOOPBACK) &&
 	    (ifp->if_type != IFT_BRIDGE))
 			nd->flags |= ND6_IFF_ACCEPT_RTADV;
 	if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_NO_RADR;
 
 	/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
 	nd6_setmtu0(ifp, nd);
 
 	return nd;
 }
 
 void
-nd6_ifdetach(struct nd_ifinfo *nd)
+nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd)
 {
+	struct ifaddr *ifa, *next;
 
+	IF_ADDR_RLOCK(ifp);
+	TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
+		if (ifa->ifa_addr->sa_family != AF_INET6)
+			continue;
+
+		/* stop DAD processing */
+		nd6_dad_stop(ifa);
+	}
+	IF_ADDR_RUNLOCK(ifp);
+
 	free(nd, M_IP6NDP);
 }
 
 /*
  * Reset ND level link MTU. This function is called when the physical MTU
  * changes, which means we might have to adjust the ND level MTU.
  */
 void
 nd6_setmtu(struct ifnet *ifp)
 {
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;
 
 	nd6_setmtu0(ifp, ND_IFINFO(ifp));
 }
 
 /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */
 void
 nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
 {
 	u_int32_t omaxmtu;
 
 	omaxmtu = ndi->maxmtu;
 
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 		ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
 		break;
 	case IFT_FDDI:
 		ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
 		break;
 	case IFT_ISO88025:
 		 ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
 		 break;
 	default:
 		ndi->maxmtu = ifp->if_mtu;
 		break;
 	}
 
 	/*
 	 * Decreasing the interface MTU under IPV6 minimum MTU may cause
 	 * undesirable situation.  We thus notify the operator of the change
 	 * explicitly.  The check for omaxmtu is necessary to restrict the
 	 * log to the case of changing the MTU, not initializing it.
 	 */
 	if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
 		log(LOG_NOTICE, "nd6_setmtu0: "
 		    "new link MTU on %s (%lu) is too small for IPv6\n",
 		    if_name(ifp), (unsigned long)ndi->maxmtu);
 	}
 
 	if (ndi->maxmtu > V_in6_maxmtu)
 		in6_setmaxmtu(); /* check all interfaces just in case */
 
 }
 
 void
 nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
 {
 
 	bzero(ndopts, sizeof(*ndopts));
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
 	ndopts->nd_opts_last
 		= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
 
 	if (icmp6len == 0) {
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 }
 
 /*
  * Take one ND option.
  */
 struct nd_opt_hdr *
 nd6_option(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int olen;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return NULL;
 	if (ndopts->nd_opts_done)
 		return NULL;
 
 	nd_opt = ndopts->nd_opts_search;
 
 	/* make sure nd_opt_len is inside the buffer */
 	if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	olen = nd_opt->nd_opt_len << 3;
 	if (olen == 0) {
 		/*
 		 * Message validation requires that all included
 		 * options have a length that is greater than zero.
 		 */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
 	if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
 		/* option overruns the end of buffer, invalid */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
 		/* reached the end of options chain */
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 	return nd_opt;
 }
 
 /*
  * Parse multiple ND options.
  * This function is much easier to use, for ND routines that do not need
  * multiple options of the same type.
  */
 int
 nd6_options(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int i = 0;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return 0;
 
 	while (1) {
 		nd_opt = nd6_option(ndopts);
 		if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
 			/*
 			 * Message validation requires that all included
 			 * options have a length that is greater than zero.
 			 */
 			ICMP6STAT_INC(icp6s_nd_badopt);
 			bzero(ndopts, sizeof(*ndopts));
 			return -1;
 		}
 
 		if (nd_opt == NULL)
 			goto skip1;
 
 		switch (nd_opt->nd_opt_type) {
 		case ND_OPT_SOURCE_LINKADDR:
 		case ND_OPT_TARGET_LINKADDR:
 		case ND_OPT_MTU:
 		case ND_OPT_REDIRECTED_HEADER:
 		case ND_OPT_NONCE:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
 				nd6log((LOG_INFO,
 				    "duplicated ND6 option found (type=%d)\n",
 				    nd_opt->nd_opt_type));
 				/* XXX bark? */
 			} else {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			break;
 		case ND_OPT_PREFIX_INFORMATION:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			ndopts->nd_opts_pi_end =
 				(struct nd_opt_prefix_info *)nd_opt;
 			break;
 		/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
 		case ND_OPT_RDNSS:	/* RFC 6106 */
 		case ND_OPT_DNSSL:	/* RFC 6106 */
 			/*
 			 * Silently ignore options we know and do not care about
 			 * in the kernel.
 			 */
 			break;
 		default:
 			/*
 			 * Unknown options must be silently ignored,
 			 * to accomodate future extension to the protocol.
 			 */
 			nd6log((LOG_DEBUG,
 			    "nd6_options: unsupported option %d - "
 			    "option ignored\n", nd_opt->nd_opt_type));
 		}
 
 skip1:
 		i++;
 		if (i > V_nd6_maxndopt) {
 			ICMP6STAT_INC(icp6s_nd_toomanyopt);
 			nd6log((LOG_INFO, "too many loop in nd opt\n"));
 			break;
 		}
 
 		if (ndopts->nd_opts_done)
 			break;
 	}
 
 	return 0;
 }
 
 /*
  * ND6 timer routine to handle ND6 entries
  */
 static void
 nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
 {
 	int canceled;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	if (tick < 0) {
 		ln->la_expire = 0;
 		ln->ln_ntick = 0;
 		canceled = callout_stop(&ln->lle_timer);
 	} else {
 		ln->la_expire = time_uptime + tick / hz;
 		LLE_ADDREF(ln);
 		if (tick > INT_MAX) {
 			ln->ln_ntick = tick - INT_MAX;
 			canceled = callout_reset(&ln->lle_timer, INT_MAX,
 			    nd6_llinfo_timer, ln);
 		} else {
 			ln->ln_ntick = 0;
 			canceled = callout_reset(&ln->lle_timer, tick,
 			    nd6_llinfo_timer, ln);
 		}
 	}
 	if (canceled > 0)
 		LLE_REMREF(ln);
 }
 
 /*
  * Gets source address of the first packet in hold queue
  * and stores it in @src.
  * Returns pointer to @src (if hold queue is not empty) or NULL.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline struct in6_addr *
 nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src)
 {
 	struct ip6_hdr hdr;
 	struct mbuf *m;
 
 	if (ln->la_hold == NULL)
 		return (NULL);
 
 	/*
 	 * assume every packet in la_hold has the same IP header
 	 */
 	m = ln->la_hold;
 	if (sizeof(hdr) > m->m_len)
 		return (NULL);
 
 	m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
 	*src = hdr.ip6_src;
 
 	return (src);
 }
 
 /*
  * Checks if we need to switch from STALE state.
  *
  * RFC 4861 requires switching from STALE to DELAY state
  * on first packet matching entry, waiting V_nd6_delay and
  * transition to PROBE state (if upper layer confirmation was
  * not received).
  *
  * This code performs a bit differently:
  * On packet hit we don't change state (but desired state
  * can be guessed by control plane). However, after V_nd6_delay
  * seconds code will transition to PROBE state (so DELAY state
  * is kinda skipped in most situations).
  *
  * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so
  * we perform the following upon entering STALE state:
  *
  * 1) Arm timer to run each V_nd6_delay seconds to make sure that
  * if packet was transmitted at the start of given interval, we
  * would be able to switch to PROBE state in V_nd6_delay seconds
  * as user expects.
  *
  * 2) Reschedule timer until original V_nd6_gctimer expires keeping
  * lle in STALE state (remaining timer value stored in lle_remtime).
  *
  * 3) Reschedule timer if packet was transmitted less that V_nd6_delay
  * seconds ago.
  *
  * Returns non-zero value if the entry is still STALE (storing
  * the next timer interval in @pdelay).
  *
  * Returns zero value if original timer expired or we need to switch to
  * PROBE (store that in @do_switch variable).
  */
 static int
 nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch)
 {
 	int nd_delay, nd_gctimer, r_skip_req;
 	time_t lle_hittime;
 	long delay;
 
 	*do_switch = 0;
 	nd_gctimer = V_nd6_gctimer;
 	nd_delay = V_nd6_delay;
 
 	LLE_REQ_LOCK(lle);
 	r_skip_req = lle->r_skip_req;
 	lle_hittime = lle->lle_hittime;
 	LLE_REQ_UNLOCK(lle);
 
 	if (r_skip_req > 0) {
 
 		/*
 		 * Nonzero r_skip_req value was set upon entering
 		 * STALE state. Since value was not changed, no
 		 * packets were passed using this lle. Ask for
 		 * timer reschedule and keep STALE state.
 		 */
 		delay = (long)(MIN(nd_gctimer, nd_delay));
 		delay *= hz;
 		if (lle->lle_remtime > delay)
 			lle->lle_remtime -= delay;
 		else {
 			delay = lle->lle_remtime;
 			lle->lle_remtime = 0;
 		}
 
 		if (delay == 0) {
 
 			/*
 			 * The original ng6_gctime timeout ended,
 			 * no more rescheduling.
 			 */
 			return (0);
 		}
 
 		*pdelay = delay;
 		return (1);
 	}
 
 	/*
 	 * Packet received. Verify timestamp
 	 */
 	delay = (long)(time_uptime - lle_hittime);
 	if (delay < nd_delay) {
 
 		/*
 		 * V_nd6_delay still not passed since the first
 		 * hit in STALE state.
 		 * Reshedule timer and return.
 		 */
 		*pdelay = (long)(nd_delay - delay) * hz;
 		return (1);
 	}
 
 	/* Request switching to probe */
 	*do_switch = 1;
 	return (0);
 }
 
 
 /*
  * Switch @lle state to new state optionally arming timers.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline void
 nd6_llinfo_setstate(struct llentry *lle, int newstate)
 {
 	struct ifnet *ifp;
 	int nd_gctimer, nd_delay;
 	long delay, remtime;
 
 	delay = 0;
 	remtime = 0;
 
 	switch (newstate) {
 	case ND6_LLINFO_INCOMPLETE:
 		ifp = lle->lle_tbl->llt_ifp;
 		delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000;
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(lle)) {
 			ifp = lle->lle_tbl->llt_ifp;
 			delay = (long)ND_IFINFO(ifp)->reachable * hz;
 		}
 		break;
 	case ND6_LLINFO_STALE:
 
 		/*
 		 * Notify fast path that we want to know if any packet
 		 * is transmitted by setting r_skip_req.
 		 */
 		LLE_REQ_LOCK(lle);
 		lle->r_skip_req = 1;
 		LLE_REQ_UNLOCK(lle);
 		nd_delay = V_nd6_delay;
 		nd_gctimer = V_nd6_gctimer;
 
 		delay = (long)(MIN(nd_gctimer, nd_delay)) * hz;
 		remtime = (long)nd_gctimer * hz - delay;
 		break;
 	case ND6_LLINFO_DELAY:
 		lle->la_asked = 0;
 		delay = (long)V_nd6_delay * hz;
 		break;
 	}
 
 	if (delay > 0)
 		nd6_llinfo_settimer_locked(lle, delay);
 
 	lle->lle_remtime = remtime;
 	lle->ln_state = newstate;
 }
 
 /*
  * Timer-dependent part of nd state machine.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline void
 nd6_llinfo_timer(void *arg)
 {
 	struct llentry *ln;
 	struct in6_addr *dst, *pdst, *psrc, src;
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi;
 	int do_switch, send_ns;
 	long delay;
 
 	KASSERT(arg != NULL, ("%s: arg NULL", __func__));
 	ln = (struct llentry *)arg;
 	ifp = lltable_get_ifp(ln->lle_tbl);
 	CURVNET_SET(ifp->if_vnet);
 
 	ND6_RLOCK();
 	LLE_WLOCK(ln);
 	if (callout_pending(&ln->lle_timer)) {
 		/*
 		 * Here we are a bit odd here in the treatment of 
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
 		 * it would have return 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by nd6_llinfo_settimer_locked above since canceled
 		 * would have been 1.
 		 */
 		LLE_WUNLOCK(ln);
 		ND6_RUNLOCK();
 		CURVNET_RESTORE();
 		return;
 	}
 	ndi = ND_IFINFO(ifp);
 	send_ns = 0;
 	dst = &ln->r_l3addr.addr6;
 	pdst = dst;
 
 	if (ln->ln_ntick > 0) {
 		if (ln->ln_ntick > INT_MAX) {
 			ln->ln_ntick -= INT_MAX;
 			nd6_llinfo_settimer_locked(ln, INT_MAX);
 		} else {
 			ln->ln_ntick = 0;
 			nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
 		}
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_STATIC) {
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_DELETED) {
 		nd6_free(&ln, 0);
 		goto done;
 	}
 
 	switch (ln->ln_state) {
 	case ND6_LLINFO_INCOMPLETE:
 		if (ln->la_asked < V_nd6_mmaxtries) {
 			ln->la_asked++;
 			send_ns = 1;
 			/* Send NS to multicast address */
 			pdst = NULL;
 		} else {
 			struct mbuf *m = ln->la_hold;
 			if (m) {
 				struct mbuf *m0;
 
 				/*
 				 * assuming every packet in la_hold has the
 				 * same IP header.  Send error after unlock.
 				 */
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				ln->la_hold = m0;
 				clear_llinfo_pqueue(ln);
 			}
 			nd6_free(&ln, 0);
 			if (m != NULL)
 				icmp6_error2(m, ICMP6_DST_UNREACH,
 				    ICMP6_DST_UNREACH_ADDR, 0, ifp);
 		}
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(ln))
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 		break;
 
 	case ND6_LLINFO_STALE:
 		if (nd6_is_stale(ln, &delay, &do_switch) != 0) {
 
 			/*
 			 * No packet has used this entry and GC timeout
 			 * has not been passed. Reshedule timer and
 			 * return.
 			 */
 			nd6_llinfo_settimer_locked(ln, delay);
 			break;
 		}
 
 		if (do_switch == 0) {
 
 			/*
 			 * GC timer has ended and entry hasn't been used.
 			 * Run Garbage collector (RFC 4861, 5.3)
 			 */
 			if (!ND6_LLINFO_PERMANENT(ln))
 				nd6_free(&ln, 1);
 			break;
 		}
 
 		/* Entry has been used AND delay timer has ended. */
 
 		/* FALLTHROUGH */
 
 	case ND6_LLINFO_DELAY:
 		if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
 			/* We need NUD */
 			ln->la_asked = 1;
 			nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE);
 			send_ns = 1;
 		} else
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */
 		break;
 	case ND6_LLINFO_PROBE:
 		if (ln->la_asked < V_nd6_umaxtries) {
 			ln->la_asked++;
 			send_ns = 1;
 		} else {
 			nd6_free(&ln, 0);
 		}
 		break;
 	default:
 		panic("%s: paths in a dark night can be confusing: %d",
 		    __func__, ln->ln_state);
 	}
 done:
 	if (ln != NULL)
 		ND6_RUNLOCK();
 	if (send_ns != 0) {
 		nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 		psrc = nd6_llinfo_get_holdsrc(ln, &src);
 		LLE_FREE_LOCKED(ln);
 		ln = NULL;
 		nd6_ns_output(ifp, psrc, pdst, dst, NULL);
 	}
 
 	if (ln != NULL)
 		LLE_FREE_LOCKED(ln);
 	CURVNET_RESTORE();
 }
 
 
 /*
  * ND6 timer routine to expire default route list and prefix list
  */
 void
 nd6_timer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_drhead drq;
 	struct nd_defrouter *dr, *ndr;
 	struct nd_prefix *pr, *npr;
 	struct in6_ifaddr *ia6, *nia6;
 
-	callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
-	    nd6_timer, curvnet);
-
 	TAILQ_INIT(&drq);
 
 	/* expire default router list */
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr)
 		if (dr->expire && dr->expire < time_uptime)
 			defrouter_unlink(dr, &drq);
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 
 	/*
 	 * expire interface addresses.
 	 * in the past the loop was inside prefix expiry processing.
 	 * However, from a stricter speci-confrmance standpoint, we should
 	 * rather separate address lifetimes and prefix lifetimes.
 	 *
 	 * XXXRW: in6_ifaddrhead locking.
 	 */
   addrloop:
 	TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
 		/* check address lifetime */
 		if (IFA6_IS_INVALID(ia6)) {
 			int regen = 0;
 
 			/*
 			 * If the expiring address is temporary, try
 			 * regenerating a new one.  This would be useful when
 			 * we suspended a laptop PC, then turned it on after a
 			 * period that could invalidate all temporary
 			 * addresses.  Although we may have to restart the
 			 * loop (see below), it must be after purging the
 			 * address.  Otherwise, we'd see an infinite loop of
 			 * regeneration.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 				if (regen_tmpaddr(ia6) == 0)
 					regen = 1;
 			}
 
 			in6_purgeaddr(&ia6->ia_ifa);
 
 			if (regen)
 				goto addrloop; /* XXX: see below */
 		} else if (IFA6_IS_DEPRECATED(ia6)) {
 			int oldflags = ia6->ia6_flags;
 
 			ia6->ia6_flags |= IN6_IFF_DEPRECATED;
 
 			/*
 			 * If a temporary address has just become deprecated,
 			 * regenerate a new one if possible.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (oldflags & IN6_IFF_DEPRECATED) == 0) {
 
 				if (regen_tmpaddr(ia6) == 0) {
 					/*
 					 * A new temporary address is
 					 * generated.
 					 * XXX: this means the address chain
 					 * has changed while we are still in
 					 * the loop.  Although the change
 					 * would not cause disaster (because
 					 * it's not a deletion, but an
 					 * addition,) we'd rather restart the
 					 * loop just for safety.  Or does this
 					 * significantly reduce performance??
 					 */
 					goto addrloop;
 				}
 			}
 		} else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) {
 			/*
 			 * Schedule DAD for a tentative address.  This happens
 			 * if the interface was down or not running
 			 * when the address was configured.
 			 */
 			int delay;
 
 			delay = arc4random() %
 			    (MAX_RTR_SOLICITATION_DELAY * hz);
 			nd6_dad_start((struct ifaddr *)ia6, delay);
 		} else {
 			/*
 			 * Check status of the interface.  If it is down,
 			 * mark the address as tentative for future DAD.
 			 */
 			if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 ||
 			    (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING)
 				== 0 ||
 			    (ND_IFINFO(ia6->ia_ifp)->flags &
 				ND6_IFF_IFDISABLED) != 0) {
 				ia6->ia6_flags &= ~IN6_IFF_DUPLICATED;
 				ia6->ia6_flags |= IN6_IFF_TENTATIVE;
 			}
 			/*
 			 * A new RA might have made a deprecated address
 			 * preferred.
 			 */
 			ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
 		}
 	}
 
 	/* expire prefix list */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		/*
 		 * check prefix lifetime.
 		 * since pltime is just for autoconf, pltime processing for
 		 * prefix is not necessary.
 		 */
 		if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME &&
 		    time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) {
 
 			/*
 			 * address expiration and prefix expiration are
 			 * separate.  NEVER perform in6_purgeaddr here.
 			 */
 			prelist_remove(pr);
 		}
 	}
+
+	callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
+	    nd6_timer, curvnet);
+
 	CURVNET_RESTORE();
 }
 
 /*
  * ia6 - deprecated/invalidated temporary address
  */
 static int
 regen_tmpaddr(struct in6_ifaddr *ia6)
 {
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in6_ifaddr *public_ifa6 = NULL;
 
 	ifp = ia6->ia_ifa.ifa_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *it6;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		it6 = (struct in6_ifaddr *)ifa;
 
 		/* ignore no autoconf addresses. */
 		if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 			continue;
 
 		/* ignore autoconf addresses with different prefixes. */
 		if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
 			continue;
 
 		/*
 		 * Now we are looking at an autoconf address with the same
 		 * prefix as ours.  If the address is temporary and is still
 		 * preferred, do not create another one.  It would be rare, but
 		 * could happen, for example, when we resume a laptop PC after
 		 * a long period.
 		 */
 		if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 		    !IFA6_IS_DEPRECATED(it6)) {
 			public_ifa6 = NULL;
 			break;
 		}
 
 		/*
 		 * This is a public autoconf address that has the same prefix
 		 * as ours.  If it is preferred, keep it.  We can't break the
 		 * loop here, because there may be a still-preferred temporary
 		 * address with the prefix.
 		 */
 		if (!IFA6_IS_DEPRECATED(it6))
 			public_ifa6 = it6;
 	}
 	if (public_ifa6 != NULL)
 		ifa_ref(&public_ifa6->ia_ifa);
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (public_ifa6 != NULL) {
 		int e;
 
 		if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
 			ifa_free(&public_ifa6->ia_ifa);
 			log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
 			    " tmp addr,errno=%d\n", e);
 			return (-1);
 		}
 		ifa_free(&public_ifa6->ia_ifa);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Nuke neighbor cache/prefix/default router management table, right before
  * ifp goes away.
  */
 void
 nd6_purge(struct ifnet *ifp)
 {
 	struct nd_drhead drq;
 	struct nd_defrouter *dr, *ndr;
 	struct nd_prefix *pr, *npr;
 
 	TAILQ_INIT(&drq);
 
 	/*
 	 * Nuke default router list entries toward ifp.
 	 * We defer removal of default router list entries that is installed
 	 * in the routing table, in order to keep additional side effects as
 	 * small as possible.
 	 */
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		if (dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		if (!dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 
 	/* Nuke prefix list entries toward ifp */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		if (pr->ndpr_ifp == ifp) {
 			/*
 			 * Because if_detach() does *not* release prefixes
 			 * while purging addresses the reference count will
 			 * still be above zero. We therefore reset it to
 			 * make sure that the prefix really gets purged.
 			 */
 			pr->ndpr_refcnt = 0;
 
 			/*
 			 * Previously, pr->ndpr_addr is removed as well,
 			 * but I strongly believe we don't have to do it.
 			 * nd6_purge() is only called from in6_ifdetach(),
 			 * which removes all the associated interface addresses
 			 * by itself.
 			 * (jinmei@kame.net 20010129)
 			 */
 			prelist_remove(pr);
 		}
 	}
 
 	/* cancel default outgoing interface setting */
 	if (V_nd6_defifindex == ifp->if_index)
 		nd6_setdefaultiface(0);
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/* Refresh default router list. */
 		defrouter_select();
 	}
 
 	/* XXXXX
 	 * We do not nuke the neighbor cache entries here any more
 	 * because the neighbor cache is kept in if_afdata[AF_INET6].
 	 * nd6_purge() is invoked by in6_ifdetach() which is called
 	 * from if_detach() where everything gets purged. So let
 	 * in6_domifdetach() do the actual L2 table purging work.
 	 */
 }
 
 /* 
  * the caller acquires and releases the lock on the lltbls
  * Returns the llentry locked
  */
 struct llentry *
 nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 	
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	IF_AFDATA_LOCK_ASSERT(ifp);
 
 	ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6);
 
 	return (ln);
 }
 
 struct llentry *
 nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6);
 	if (ln != NULL)
 		ln->ln_state = ND6_LLINFO_NOSTATE;
 
 	return (ln);
 }
 
 /*
  * Test whether a given IPv6 address is a neighbor or not, ignoring
  * the actual neighbor cache.  The neighbor cache is ignored in order
  * to not reenter the routing code from within itself.
  */
 static int
 nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct nd_prefix *pr;
 	struct ifaddr *dstaddr;
 	struct rt_addrinfo info;
 	struct sockaddr_in6 rt_key;
 	struct sockaddr *dst6;
 	int fibnum;
 
 	/*
 	 * A link-local address is always a neighbor.
 	 * XXX: a link does not necessarily specify a single interface.
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
 		struct sockaddr_in6 sin6_copy;
 		u_int32_t zone;
 
 		/*
 		 * We need sin6_copy since sa6_recoverscope() may modify the
 		 * content (XXX).
 		 */
 		sin6_copy = *addr;
 		if (sa6_recoverscope(&sin6_copy))
 			return (0); /* XXX: should be impossible */
 		if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
 			return (0);
 		if (sin6_copy.sin6_scope_id == zone)
 			return (1);
 		else
 			return (0);
 	}
 
 	bzero(&rt_key, sizeof(rt_key));
 	bzero(&info, sizeof(info));
 	info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key;
 
 	/* Always use the default FIB here. XXME - why? */
 	fibnum = RT_DEFAULT_FIB;
 
 	/*
 	 * If the address matches one of our addresses,
 	 * it should be a neighbor.
 	 * If the address matches one of our on-link prefixes, it should be a
 	 * neighbor.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_ifp != ifp)
 			continue;
 
 		if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) {
 
 			/* Always use the default FIB here. */
 			dst6 = (struct sockaddr *)&pr->ndpr_prefix;
 
 			/* Restore length field before retrying lookup */
 			rt_key.sin6_len = sizeof(rt_key);
 			if (rib_lookup_info(fibnum, dst6, 0, 0, &info) != 0)
 				continue;
 			/*
 			 * This is the case where multiple interfaces
 			 * have the same prefix, but only one is installed 
 			 * into the routing table and that prefix entry
 			 * is not the one being examined here. In the case
 			 * where RADIX_MPATH is enabled, multiple route
 			 * entries (of the same rt_key value) will be 
 			 * installed because the interface addresses all
 			 * differ.
 			 */
 			if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 			       &rt_key.sin6_addr))
 				continue;
 		}
 
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 		    &addr->sin6_addr, &pr->ndpr_mask))
 			return (1);
 	}
 
 	/*
 	 * If the address is assigned on the node of the other side of
 	 * a p2p interface, the address should be a neighbor.
 	 */
 	dstaddr = ifa_ifwithdstaddr((const struct sockaddr *)addr, RT_ALL_FIBS);
 	if (dstaddr != NULL) {
 		if (dstaddr->ifa_ifp == ifp) {
 			ifa_free(dstaddr);
 			return (1);
 		}
 		ifa_free(dstaddr);
 	}
 
 	/*
 	 * If the default router list is empty, all addresses are regarded
 	 * as on-link, and thus, as a neighbor.
 	 */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
 	    TAILQ_EMPTY(&V_nd_defrouter) &&
 	    V_nd6_defifindex == ifp->if_index) {
 		return (1);
 	}
 
 	return (0);
 }
 
 
 /*
  * Detect if a given IPv6 address identifies a neighbor on a given link.
  * XXX: should take care of the destination of a p2p link?
  */
 int
 nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct llentry *lle;
 	int rc = 0;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 	if (nd6_is_new_addr_neighbor(addr, ifp))
 		return (1);
 
 	/*
 	 * Even if the address matches none of our addresses, it might be
 	 * in the neighbor cache.
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) {
 		LLE_RUNLOCK(lle);
 		rc = 1;
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 	return (rc);
 }
 
 /*
  * Free an nd6 llinfo entry.
  * Since the function would cause significant changes in the kernel, DO NOT
  * make it global, unless you have a strong reason for the change, and are sure
  * that the change is safe.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline void
 nd6_free(struct llentry **lnp, int gc)
 {
 	struct ifnet *ifp;
 	struct llentry *ln;
 	struct nd_defrouter *dr;
 
 	ln = *lnp;
 	*lnp = NULL;
 
 	LLE_WLOCK_ASSERT(ln);
 	ND6_RLOCK_ASSERT();
 
 	ifp = lltable_get_ifp(ln->lle_tbl);
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0)
 		dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp);
 	else
 		dr = NULL;
 	ND6_RUNLOCK();
 
 	if ((ln->la_flags & LLE_DELETED) == 0)
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 
 	/*
 	 * we used to have pfctlinput(PRC_HOSTDEAD) here.
 	 * even though it is not harmful, it was not really necessary.
 	 */
 
 	/* cancel timer */
 	nd6_llinfo_settimer_locked(ln, -1);
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		if (dr != NULL && dr->expire &&
 		    ln->ln_state == ND6_LLINFO_STALE && gc) {
 			/*
 			 * If the reason for the deletion is just garbage
 			 * collection, and the neighbor is an active default
 			 * router, do not delete it.  Instead, reset the GC
 			 * timer using the router's lifetime.
 			 * Simply deleting the entry would affect default
 			 * router selection, which is not necessarily a good
 			 * thing, especially when we're using router preference
 			 * values.
 			 * XXX: the check for ln_state would be redundant,
 			 *      but we intentionally keep it just in case.
 			 */
 			if (dr->expire > time_uptime)
 				nd6_llinfo_settimer_locked(ln,
 				    (dr->expire - time_uptime) * hz);
 			else
 				nd6_llinfo_settimer_locked(ln,
 				    (long)V_nd6_gctimer * hz);
 
 			LLE_REMREF(ln);
 			LLE_WUNLOCK(ln);
 			defrouter_rele(dr);
 			return;
 		}
 
 		if (dr) {
 			/*
 			 * Unreachablity of a router might affect the default
 			 * router selection and on-link detection of advertised
 			 * prefixes.
 			 */
 
 			/*
 			 * Temporarily fake the state to choose a new default
 			 * router and to perform on-link determination of
 			 * prefixes correctly.
 			 * Below the state will be set correctly,
 			 * or the entry itself will be deleted.
 			 */
 			ln->ln_state = ND6_LLINFO_INCOMPLETE;
 		}
 
 		if (ln->ln_router || dr) {
 
 			/*
 			 * We need to unlock to avoid a LOR with rt6_flush() with the
 			 * rnh and for the calls to pfxlist_onlink_check() and
 			 * defrouter_select() in the block further down for calls
 			 * into nd6_lookup().  We still hold a ref.
 			 */
 			LLE_WUNLOCK(ln);
 
 			/*
 			 * rt6_flush must be called whether or not the neighbor
 			 * is in the Default Router List.
 			 * See a corresponding comment in nd6_na_input().
 			 */
 			rt6_flush(&ln->r_l3addr.addr6, ifp);
 		}
 
 		if (dr) {
 			/*
 			 * Since defrouter_select() does not affect the
 			 * on-link determination and MIP6 needs the check
 			 * before the default router selection, we perform
 			 * the check now.
 			 */
 			pfxlist_onlink_check();
 
 			/*
 			 * Refresh default router list.
 			 */
 			defrouter_select();
 		}
 
 		/*
 		 * If this entry was added by an on-link redirect, remove the
 		 * corresponding host route.
 		 */
 		if (ln->la_flags & LLE_REDIRECT)
 			nd6_free_redirect(ln);
 
 		if (ln->ln_router || dr)
 			LLE_WLOCK(ln);
 	}
 
 	/*
 	 * Save to unlock. We still hold an extra reference and will not
 	 * free(9) in llentry_free() if someone else holds one as well.
 	 */
 	LLE_WUNLOCK(ln);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(ln);
 	/* Guard against race with other llentry_free(). */
 	if (ln->la_flags & LLE_LINKED) {
 		/* Remove callout reference */
 		LLE_REMREF(ln);
 		lltable_unlink_entry(ln->lle_tbl, ln);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	llentry_free(ln);
 	if (dr != NULL)
 		defrouter_rele(dr);
 }
 
 static int
 nd6_isdynrte(const struct rtentry *rt, void *xap)
 {
 
 	if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC))
 		return (1);
 
 	return (0);
 }
 /*
  * Remove the rtentry for the given llentry,
  * both of which were installed by a redirect.
  */
 static void
 nd6_free_redirect(const struct llentry *ln)
 {
 	int fibnum;
 	struct sockaddr_in6 sin6;
 	struct rt_addrinfo info;
 
 	lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6);
 	memset(&info, 0, sizeof(info));
 	info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6;
 	info.rti_filter = nd6_isdynrte;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
 		rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
 }
 
 /*
  * Rejuvenate this function for routing operations related
  * processing.
  */
 void
 nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct sockaddr_in6 *gateway;
 	struct nd_defrouter *dr;
 	struct ifnet *ifp;
 
 	gateway = (struct sockaddr_in6 *)rt->rt_gateway;
 	ifp = rt->rt_ifp;
 
 	switch (req) {
 	case RTM_ADD:
 		break;
 
 	case RTM_DELETE:
 		if (!ifp)
 			return;
 		/*
 		 * Only indirect routes are interesting.
 		 */
 		if ((rt->rt_flags & RTF_GATEWAY) == 0)
 			return;
 		/*
 		 * check for default route
 		 */
 		if (IN6_ARE_ADDR_EQUAL(&in6addr_any,
 		    &SIN6(rt_key(rt))->sin6_addr)) {
 			dr = defrouter_lookup(&gateway->sin6_addr, ifp);
 			if (dr != NULL) {
 				dr->installed = 0;
 				defrouter_rele(dr);
 			}
 		}
 		break;
 	}
 }
 
 
 int
 nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	struct in6_ndireq *ndi = (struct in6_ndireq *)data;
 	struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
 	struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
 	int error = 0;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 	switch (cmd) {
 	case OSIOCGIFINFO_IN6:
 #define ND	ndi->ndi
 		/* XXX: old ndp(8) assumes a positive value for linkmtu. */
 		bzero(&ND, sizeof(ND));
 		ND.linkmtu = IN6_LINKMTU(ifp);
 		ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
 		ND.basereachable = ND_IFINFO(ifp)->basereachable;
 		ND.reachable = ND_IFINFO(ifp)->reachable;
 		ND.retrans = ND_IFINFO(ifp)->retrans;
 		ND.flags = ND_IFINFO(ifp)->flags;
 		ND.recalctm = ND_IFINFO(ifp)->recalctm;
 		ND.chlim = ND_IFINFO(ifp)->chlim;
 		break;
 	case SIOCGIFINFO_IN6:
 		ND = *ND_IFINFO(ifp);
 		break;
 	case SIOCSIFINFO_IN6:
 		/*
 		 * used to change host variables from userland.
 		 * intented for a use on router to reflect RA configurations.
 		 */
 		/* 0 means 'unspecified' */
 		if (ND.linkmtu != 0) {
 			if (ND.linkmtu < IPV6_MMTU ||
 			    ND.linkmtu > IN6_LINKMTU(ifp)) {
 				error = EINVAL;
 				break;
 			}
 			ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
 		}
 
 		if (ND.basereachable != 0) {
 			int obasereachable = ND_IFINFO(ifp)->basereachable;
 
 			ND_IFINFO(ifp)->basereachable = ND.basereachable;
 			if (ND.basereachable != obasereachable)
 				ND_IFINFO(ifp)->reachable =
 				    ND_COMPUTE_RTIME(ND.basereachable);
 		}
 		if (ND.retrans != 0)
 			ND_IFINFO(ifp)->retrans = ND.retrans;
 		if (ND.chlim != 0)
 			ND_IFINFO(ifp)->chlim = ND.chlim;
 		/* FALLTHROUGH */
 	case SIOCSIFINFO_FLAGS:
 	{
 		struct ifaddr *ifa;
 		struct in6_ifaddr *ia;
 
 		if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 		    !(ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 1->0 transision */
 
 			/*
 			 * If the interface is marked as ND6_IFF_IFDISABLED and
 			 * has an link-local address with IN6_IFF_DUPLICATED,
 			 * do not clear ND6_IFF_IFDISABLED.
 			 * See RFC 4862, Section 5.4.5.
 			 */
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
 				    IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
 					break;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 
 			if (ifa != NULL) {
 				/* LLA is duplicated. */
 				ND.flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "Cannot enable an interface"
 				    " with a link-local address marked"
 				    " duplicate.\n");
 			} else {
 				ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
 				if (ifp->if_flags & IFF_UP)
 					in6_if_up(ifp);
 			}
 		} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 			    (ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 0->1 transision */
 			/* Mark all IPv6 address as tentative. */
 
 			ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 			if (V_ip6_dad_count > 0 &&
 			    (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) {
 				IF_ADDR_RLOCK(ifp);
 				TAILQ_FOREACH(ifa, &ifp->if_addrhead,
 				    ifa_link) {
 					if (ifa->ifa_addr->sa_family !=
 					    AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					ia->ia6_flags |= IN6_IFF_TENTATIVE;
 				}
 				IF_ADDR_RUNLOCK(ifp);
 			}
 		}
 
 		if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
 			if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
 				/* auto_linklocal 0->1 transision */
 
 				/* If no link-local address on ifp, configure */
 				ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
 				in6_ifattach(ifp, NULL);
 			} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
 			    ifp->if_flags & IFF_UP) {
 				/*
 				 * When the IF already has
 				 * ND6_IFF_AUTO_LINKLOCAL, no link-local
 				 * address is assigned, and IFF_UP, try to
 				 * assign one.
 				 */
 				IF_ADDR_RLOCK(ifp);
 				TAILQ_FOREACH(ifa, &ifp->if_addrhead,
 				    ifa_link) {
 					if (ifa->ifa_addr->sa_family !=
 					    AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
 						break;
 				}
 				IF_ADDR_RUNLOCK(ifp);
 				if (ifa != NULL)
 					/* No LLA is configured. */
 					in6_ifattach(ifp, NULL);
 			}
 		}
 	}
 		ND_IFINFO(ifp)->flags = ND.flags;
 		break;
 #undef ND
 	case SIOCSNDFLUSH_IN6:	/* XXX: the ioctl name is confusing... */
 		/* sync kernel routing table with the default router list */
 		defrouter_reset();
 		defrouter_select();
 		break;
 	case SIOCSPFXFLUSH_IN6:
 	{
 		/* flush all the prefix advertised by routers */
 		struct nd_prefix *pr, *next;
 
 		LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
 			struct in6_ifaddr *ia, *ia_next;
 
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 				continue; /* XXX */
 
 			/* do we really have to remove addresses as well? */
 			/* XXXRW: in6_ifaddrhead locking. */
 			TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
 			    ia_next) {
 				if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 					continue;
 
 				if (ia->ia6_ndpr == pr)
 					in6_purgeaddr(&ia->ia_ifa);
 			}
 			prelist_remove(pr);
 		}
 		break;
 	}
 	case SIOCSRTRFLUSH_IN6:
 	{
 		/* flush all the default routers */
 		struct nd_drhead drq;
 		struct nd_defrouter *dr;
 
 		TAILQ_INIT(&drq);
 
 		defrouter_reset();
 
 		ND6_WLOCK();
 		while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL)
 			defrouter_unlink(dr, &drq);
 		ND6_WUNLOCK();
 		while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 			TAILQ_REMOVE(&drq, dr, dr_entry);
 			defrouter_del(dr);
 		}
 
 		defrouter_select();
 		break;
 	}
 	case SIOCGNBRINFO_IN6:
 	{
 		struct llentry *ln;
 		struct in6_addr nb_addr = nbi->addr; /* make local for safety */
 
 		if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
 			return (error);
 
 		IF_AFDATA_RLOCK(ifp);
 		ln = nd6_lookup(&nb_addr, 0, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if (ln == NULL) {
 			error = EINVAL;
 			break;
 		}
 		nbi->state = ln->ln_state;
 		nbi->asked = ln->la_asked;
 		nbi->isrouter = ln->ln_router;
 		if (ln->la_expire == 0)
 			nbi->expire = 0;
 		else
 			nbi->expire = ln->la_expire + ln->lle_remtime / hz +
 			    (time_second - time_uptime);
 		LLE_RUNLOCK(ln);
 		break;
 	}
 	case SIOCGDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		ndif->ifindex = V_nd6_defifindex;
 		break;
 	case SIOCSDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		return (nd6_setdefaultiface(ndif->ifindex));
 	}
 	return (error);
 }
 
 /*
  * Calculates new isRouter value based on provided parameters and
  * returns it.
  */
 static int
 nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr,
     int ln_router)
 {
 
 	/*
 	 * ICMP6 type dependent behavior.
 	 *
 	 * NS: clear IsRouter if new entry
 	 * RS: clear IsRouter
 	 * RA: set IsRouter if there's lladdr
 	 * redir: clear IsRouter if new entry
 	 *
 	 * RA case, (1):
 	 * The spec says that we must set IsRouter in the following cases:
 	 * - If lladdr exist, set IsRouter.  This means (1-5).
 	 * - If it is old entry (!newentry), set IsRouter.  This means (7).
 	 * So, based on the spec, in (1-5) and (7) cases we must set IsRouter.
 	 * A quetion arises for (1) case.  (1) case has no lladdr in the
 	 * neighbor cache, this is similar to (6).
 	 * This case is rare but we figured that we MUST NOT set IsRouter.
 	 *
 	 *   is_new  old_addr new_addr 	    NS  RS  RA	redir
 	 *							D R
 	 *	0	n	n	(1)	c   ?     s
 	 *	0	y	n	(2)	c   s     s
 	 *	0	n	y	(3)	c   s     s
 	 *	0	y	y	(4)	c   s     s
 	 *	0	y	y	(5)	c   s     s
 	 *	1	--	n	(6) c	c	c s
 	 *	1	--	y	(7) c	c   s	c s
 	 *
 	 *					(c=clear s=set)
 	 */
 	switch (type & 0xff) {
 	case ND_NEIGHBOR_SOLICIT:
 		/*
 		 * New entry must have is_router flag cleared.
 		 */
 		if (is_new)					/* (6-7) */
 			ln_router = 0;
 		break;
 	case ND_REDIRECT:
 		/*
 		 * If the icmp is a redirect to a better router, always set the
 		 * is_router flag.  Otherwise, if the entry is newly created,
 		 * clear the flag.  [RFC 2461, sec 8.3]
 		 */
 		if (code == ND_REDIRECT_ROUTER)
 			ln_router = 1;
 		else {
 			if (is_new)				/* (6-7) */
 				ln_router = 0;
 		}
 		break;
 	case ND_ROUTER_SOLICIT:
 		/*
 		 * is_router flag must always be cleared.
 		 */
 		ln_router = 0;
 		break;
 	case ND_ROUTER_ADVERT:
 		/*
 		 * Mark an entry with lladdr as a router.
 		 */
 		if ((!is_new && (old_addr || new_addr)) ||	/* (2-5) */
 		    (is_new && new_addr)) {			/* (7) */
 			ln_router = 1;
 		}
 		break;
 	}
 
 	return (ln_router);
 }
 
 /*
  * Create neighbor cache entry and cache link-layer address,
  * on reception of inbound ND6 packets.  (RS/RA/NS/redirect)
  *
  * type - ICMP6 type
  * code - type dependent information
  *
  */
 void
 nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
     int lladdrlen, int type, int code)
 {
 	struct llentry *ln = NULL, *ln_tmp;
 	int is_newentry;
 	int do_update;
 	int olladdr;
 	int llchange;
 	int flags;
 	uint16_t router = 0;
 	struct sockaddr_in6 sin6;
 	struct mbuf *chain = NULL;
 	u_char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
 	KASSERT(from != NULL, ("%s: from == NULL", __func__));
 
 	/* nothing must be updated for unspecified address */
 	if (IN6_IS_ADDR_UNSPECIFIED(from))
 		return;
 
 	/*
 	 * Validation about ifp->if_addrlen and lladdrlen must be done in
 	 * the caller.
 	 *
 	 * XXX If the link does not have link-layer adderss, what should
 	 * we do? (ifp->if_addrlen == 0)
 	 * Spec says nothing in sections for RA, RS and NA.  There's small
 	 * description on it in NS section (RFC 2461 7.2.3).
 	 */
 	flags = lladdr ? LLE_EXCLUSIVE : 0;
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(from, flags, ifp);
 	IF_AFDATA_RUNLOCK(ifp);
 	is_newentry = 0;
 	if (ln == NULL) {
 		flags |= LLE_EXCLUSIVE;
 		ln = nd6_alloc(from, 0, ifp);
 		if (ln == NULL)
 			return;
 
 		/*
 		 * Since we already know all the data for the new entry,
 		 * fill it before insertion.
 		 */
 		if (lladdr != NULL) {
 			linkhdrsize = sizeof(linkhdr);
 			if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
 			    linkhdr, &linkhdrsize, &lladdr_off) != 0)
 				return;
 			lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
 			    lladdr_off);
 		}
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(ln);
 		/* Prefer any existing lle over newly-created one */
 		ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp);
 		if (ln_tmp == NULL)
 			lltable_link_entry(LLTABLE6(ifp), ln);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (ln_tmp == NULL) {
 			/* No existing lle, mark as new entry (6,7) */
 			is_newentry = 1;
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 			if (lladdr != NULL)	/* (7) */
 				EVENTHANDLER_INVOKE(lle_event, ln,
 				    LLENTRY_RESOLVED);
 		} else {
 			lltable_free_entry(LLTABLE6(ifp), ln);
 			ln = ln_tmp;
 			ln_tmp = NULL;
 		}
 	} 
 	/* do nothing if static ndp is set */
 	if ((ln->la_flags & LLE_STATIC)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		return;
 	}
 
 	olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
 	if (olladdr && lladdr) {
 		llchange = bcmp(lladdr, ln->ll_addr,
 		    ifp->if_addrlen);
 	} else if (!olladdr && lladdr)
 		llchange = 1;
 	else
 		llchange = 0;
 
 	/*
 	 * newentry olladdr  lladdr  llchange	(*=record)
 	 *	0	n	n	--	(1)
 	 *	0	y	n	--	(2)
 	 *	0	n	y	y	(3) * STALE
 	 *	0	y	y	n	(4) *
 	 *	0	y	y	y	(5) * STALE
 	 *	1	--	n	--	(6)   NOSTATE(= PASSIVE)
 	 *	1	--	y	--	(7) * STALE
 	 */
 
 	do_update = 0;
 	if (is_newentry == 0 && llchange != 0) {
 		do_update = 1;	/* (3,5) */
 
 		/*
 		 * Record source link-layer address
 		 * XXX is it dependent to ifp->if_type?
 		 */
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0)
 			return;
 
 		if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
 		    lladdr_off) == 0) {
 			/* Entry was deleted */
 			return;
 		}
 
 		nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 
 		if (ln->la_hold != NULL)
 			nd6_grab_holdchain(ln, &chain, &sin6);
 	}
 
 	/* Calculates new router status */
 	router = nd6_is_router(type, code, is_newentry, olladdr,
 	    lladdr != NULL ? 1 : 0, ln->ln_router);
 
 	ln->ln_router = router;
 	/* Mark non-router redirects with special flag */
 	if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER)
 		ln->la_flags |= LLE_REDIRECT;
 
 	if (flags & LLE_EXCLUSIVE)
 		LLE_WUNLOCK(ln);
 	else
 		LLE_RUNLOCK(ln);
 
 	if (chain != NULL)
 		nd6_flush_holdchain(ifp, ifp, chain, &sin6);
 	
 	/*
 	 * When the link-layer address of a router changes, select the
 	 * best router again.  In particular, when the neighbor entry is newly
 	 * created, it might affect the selection policy.
 	 * Question: can we restrict the first condition to the "is_newentry"
 	 * case?
 	 * XXX: when we hear an RA from a new router with the link-layer
 	 * address option, defrouter_select() is called twice, since
 	 * defrtrlist_update called the function as well.  However, I believe
 	 * we can compromise the overhead, since it only happens the first
 	 * time.
 	 * XXX: although defrouter_select() should not have a bad effect
 	 * for those are not autoconfigured hosts, we explicitly avoid such
 	 * cases for safety.
 	 */
 	if ((do_update || is_newentry) && router &&
 	    ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/*
 		 * guaranteed recursion
 		 */
 		defrouter_select();
 	}
 }
 
 static void
 nd6_slowtimo(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_ifinfo *nd6if;
 	struct ifnet *ifp;
 
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 		nd6if = ND_IFINFO(ifp);
 		if (nd6if->basereachable && /* already initialized */
 		    (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
 			/*
 			 * Since reachable time rarely changes by router
 			 * advertisements, we SHOULD insure that a new random
 			 * value gets recomputed at least once every few hours.
 			 * (RFC 2461, 6.3.4)
 			 */
 			nd6if->recalctm = V_nd6_recalc_reachtm_interval;
 			nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
 		}
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	CURVNET_RESTORE();
 }
 
 void
 nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain,
     struct sockaddr_in6 *sin6)
 {
 
 	LLE_WLOCK_ASSERT(ln);
 
 	*chain = ln->la_hold;
 	ln->la_hold = NULL;
 	lltable_fill_sa_entry(ln, (struct sockaddr *)sin6);
 
 	if (ln->ln_state == ND6_LLINFO_STALE) {
 
 		/*
 		 * The first time we send a packet to a
 		 * neighbor whose entry is STALE, we have
 		 * to change the state to DELAY and a sets
 		 * a timer to expire in DELAY_FIRST_PROBE_TIME
 		 * seconds to ensure do neighbor unreachability
 		 * detection on expiration.
 		 * (RFC 2461 7.3.3)
 		 */
 		nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY);
 	}
 }
 
 int
 nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
     struct sockaddr_in6 *dst, struct route *ro)
 {
 	int error;
 	int ip6len;
 	struct ip6_hdr *ip6;
 	struct m_tag *mtag;
 
 #ifdef MAC
 	mac_netinet6_nd6_send(ifp, m);
 #endif
 
 	/*
 	 * If called from nd6_ns_output() (NS), nd6_na_output() (NA),
 	 * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
 	 * as handled by rtsol and rtadvd), mbufs will be tagged for SeND
 	 * to be diverted to user space.  When re-injected into the kernel,
 	 * send_output() will directly dispatch them to the outgoing interface.
 	 */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
 		if (mtag != NULL) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 			/* Use the SEND socket */
 			error = send_sendso_input_hook(m, ifp, SND_OUT,
 			    ip6len);
 			/* -1 == no app on SEND socket */
 			if (error == 0 || error != -1)
 			    return (error);
 		}
 	}
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
 	    mtod(m, struct ip6_hdr *));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		origifp = ifp;
 
 	error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro);
 	return (error);
 }
 
 /*
  * Lookup link headerfor @sa_dst address. Stores found
  * data in @desten buffer. Copy of lle ln_flags can be also
  * saved in @pflags if @pflags is non-NULL.
  *
  * If destination LLE does not exists or lle state modification
  * is required, call "slow" version.
  *
  * Return values:
  * - 0 on success (address copied to buffer).
  * - EWOULDBLOCK (no local error, but address is still unresolved)
  * - other errors (alloc failure, etc)
  */
 int
 nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
     const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *ln = NULL;
 	const struct sockaddr_in6 *dst6;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	dst6 = (const struct sockaddr_in6 *)sa_dst;
 
 	/* discard the packet if IPv6 operation is disabled on the interface */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
 		m_freem(m);
 		return (ENETDOWN); /* better error? */
 	}
 
 	if (m != NULL && m->m_flags & M_MCAST) {
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_FDDI:
 		case IFT_L2VLAN:
 		case IFT_IEEE80211:
 		case IFT_BRIDGE:
 		case IFT_ISO88025:
 			ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr,
 						 desten);
 			return (0);
 		default:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(&dst6->sin6_addr, LLE_UNLOCKED, ifp);
 	if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) {
 		/* Entry found, let's copy lle info */
 		bcopy(ln->r_linkdata, desten, ln->r_hdrlen);
 		if (pflags != NULL)
 			*pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR);
 		/* Check if we have feedback request from nd6 timer */
 		if (ln->r_skip_req != 0) {
 			LLE_REQ_LOCK(ln);
 			ln->r_skip_req = 0; /* Notify that entry was used */
 			ln->lle_hittime = time_uptime;
 			LLE_REQ_UNLOCK(ln);
 		}
 		IF_AFDATA_RUNLOCK(ifp);
 		return (0);
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 
 	return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags));
 }
 
 
 /*
  * Do L2 address resolution for @sa_dst address. Stores found
  * address in @desten buffer. Copy of lle ln_flags can be also
  * saved in @pflags if @pflags is non-NULL.
  *
  * Heavy version.
  * Function assume that destination LLE does not exist,
  * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline int
 nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m,
     const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *lle = NULL, *lle_tmp;
 	struct in6_addr *psrc, src;
 	int send_ns, ll_len;
 	char *lladdr;
 
 	/*
 	 * Address resolution or Neighbor Unreachability Detection
 	 * for the next hop.
 	 * At this point, the destination of the packet must be a unicast
 	 * or an anycast address(i.e. not a multicast).
 	 */
 	if (lle == NULL) {
 		IF_AFDATA_RLOCK(ifp);
 		lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 		if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp))  {
 			/*
 			 * Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
 			 * the condition below is not very efficient.  But we believe
 			 * it is tolerable, because this should be a rare case.
 			 */
 			lle = nd6_alloc(&dst->sin6_addr, 0, ifp);
 			if (lle == NULL) {
 				char ip6buf[INET6_ADDRSTRLEN];
 				log(LOG_DEBUG,
 				    "nd6_output: can't allocate llinfo for %s "
 				    "(ln=%p)\n",
 				    ip6_sprintf(ip6buf, &dst->sin6_addr), lle);
 				m_freem(m);
 				return (ENOBUFS);
 			}
 
 			IF_AFDATA_WLOCK(ifp);
 			LLE_WLOCK(lle);
 			/* Prefer any existing entry over newly-created one */
 			lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
 			if (lle_tmp == NULL)
 				lltable_link_entry(LLTABLE6(ifp), lle);
 			IF_AFDATA_WUNLOCK(ifp);
 			if (lle_tmp != NULL) {
 				lltable_free_entry(LLTABLE6(ifp), lle);
 				lle = lle_tmp;
 				lle_tmp = NULL;
 			}
 		}
 	} 
 	if (lle == NULL) {
 		if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 
 		if (m != NULL)
 			m_freem(m);
 		return (ENOBUFS);
 	}
 
 	LLE_WLOCK_ASSERT(lle);
 
 	/*
 	 * The first time we send a packet to a neighbor whose entry is
 	 * STALE, we have to change the state to DELAY and a sets a timer to
 	 * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do
 	 * neighbor unreachability detection on expiration.
 	 * (RFC 2461 7.3.3)
 	 */
 	if (lle->ln_state == ND6_LLINFO_STALE)
 		nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY);
 
 	/*
 	 * If the neighbor cache entry has a state other than INCOMPLETE
 	 * (i.e. its link-layer address is already resolved), just
 	 * send the packet.
 	 */
 	if (lle->ln_state > ND6_LLINFO_INCOMPLETE) {
 		if (flags & LLE_ADDRONLY) {
 			lladdr = lle->ll_addr;
 			ll_len = ifp->if_addrlen;
 		} else {
 			lladdr = lle->r_linkdata;
 			ll_len = lle->r_hdrlen;
 		}
 		bcopy(lladdr, desten, ll_len);
 		if (pflags != NULL)
 			*pflags = lle->la_flags;
 		LLE_WUNLOCK(lle);
 		return (0);
 	}
 
 	/*
 	 * There is a neighbor cache entry, but no ethernet address
 	 * response yet.  Append this latest packet to the end of the
 	 * packet queue in the mbuf, unless the number of the packet
 	 * does not exceed nd6_maxqueuelen.  When it exceeds nd6_maxqueuelen,
 	 * the oldest packet in the queue will be removed.
 	 */
 
 	if (lle->la_hold != NULL) {
 		struct mbuf *m_hold;
 		int i;
 		
 		i = 0;
 		for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){
 			i++;
 			if (m_hold->m_nextpkt == NULL) {
 				m_hold->m_nextpkt = m;
 				break;
 			}
 		}
 		while (i >= V_nd6_maxqueuelen) {
 			m_hold = lle->la_hold;
 			lle->la_hold = lle->la_hold->m_nextpkt;
 			m_freem(m_hold);
 			i--;
 		}
 	} else {
 		lle->la_hold = m;
 	}
 
 	/*
 	 * If there has been no NS for the neighbor after entering the
 	 * INCOMPLETE state, send the first solicitation.
 	 * Note that for newly-created lle la_asked will be 0,
 	 * so we will transition from ND6_LLINFO_NOSTATE to
 	 * ND6_LLINFO_INCOMPLETE state here.
 	 */
 	psrc = NULL;
 	send_ns = 0;
 	if (lle->la_asked == 0) {
 		lle->la_asked++;
 		send_ns = 1;
 		psrc = nd6_llinfo_get_holdsrc(lle, &src);
 
 		nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE);
 	}
 	LLE_WUNLOCK(lle);
 	if (send_ns != 0)
 		nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL);
 
 	return (EWOULDBLOCK);
 }
 
 /*
  * Do L2 address resolution for @sa_dst address. Stores found
  * address in @desten buffer. Copy of lle ln_flags can be also
  * saved in @pflags if @pflags is non-NULL.
  *
  * Return values:
  * - 0 on success (address copied to buffer).
  * - EWOULDBLOCK (no local error, but address is still unresolved)
  * - other errors (alloc failure, etc)
  */
 int
 nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags)
 {
 	int error;
 
 	flags |= LLE_ADDRONLY;
 	error = nd6_resolve_slow(ifp, flags, NULL,
 	    (const struct sockaddr_in6 *)dst, desten, pflags);
 	return (error);
 }
 
 int
 nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain,
     struct sockaddr_in6 *dst)
 {
 	struct mbuf *m, *m_head;
 	struct ifnet *outifp;
 	int error = 0;
 
 	m_head = chain;
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0)
 		outifp = origifp;
 	else
 		outifp = ifp;
 	
 	while (m_head) {
 		m = m_head;
 		m_head = m_head->m_nextpkt;
 		error = nd6_output_ifp(ifp, origifp, m, dst, NULL);
 	}
 
 	/*
 	 * XXX
 	 * note that intermediate errors are blindly ignored
 	 */
 	return (error);
 }	
 
 static int
 nd6_need_cache(struct ifnet *ifp)
 {
 	/*
 	 * XXX: we currently do not make neighbor cache on any interface
 	 * other than ARCnet, Ethernet, FDDI and GIF.
 	 *
 	 * RFC2893 says:
 	 * - unidirectional tunnels needs no ND
 	 */
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_IEEE1394:
 	case IFT_L2VLAN:
 	case IFT_IEEE80211:
 	case IFT_INFINIBAND:
 	case IFT_BRIDGE:
 	case IFT_PROPVIRTUAL:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Add pernament ND6 link-layer record for given
  * interface address.
  *
  * Very similar to IPv4 arp_ifinit(), but:
  * 1) IPv6 DAD is performed in different place
  * 2) It is called by IPv6 protocol stack in contrast to
  * arp_ifinit() which is typically called in SIOCSIFADDR
  * driver ioctl handler.
  *
  */
 int
 nd6_add_ifa_lle(struct in6_ifaddr *ia)
 {
 	struct ifnet *ifp;
 	struct llentry *ln, *ln_tmp;
 	struct sockaddr *dst;
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	if (nd6_need_cache(ifp) == 0)
 		return (0);
 
 	ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
 	dst = (struct sockaddr *)&ia->ia_addr;
 	ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst);
 	if (ln == NULL)
 		return (ENOBUFS);
 
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(ln);
 	/* Unlink any entry if exists */
 	ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst);
 	if (ln_tmp != NULL)
 		lltable_unlink_entry(LLTABLE6(ifp), ln_tmp);
 	lltable_link_entry(LLTABLE6(ifp), ln);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	if (ln_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED);
 	EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 
 	LLE_WUNLOCK(ln);
 	if (ln_tmp != NULL)
 		llentry_free(ln_tmp);
 
 	return (0);
 }
 
 /*
  * Removes either all lle entries for given @ia, or lle
  * corresponding to @ia address.
  */
 void
 nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all)
 {
 	struct sockaddr_in6 mask, addr;
 	struct sockaddr *saddr, *smask;
 	struct ifnet *ifp;
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
 	memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
 	saddr = (struct sockaddr *)&addr;
 	smask = (struct sockaddr *)&mask;
 
 	if (all != 0)
 		lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC);
 	else
 		lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr);
 }
 
 static void 
 clear_llinfo_pqueue(struct llentry *ln)
 {
 	struct mbuf *m_hold, *m_hold_next;
 
 	for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) {
 		m_hold_next = m_hold->m_nextpkt;
 		m_freem(m_hold);
 	}
 
 	ln->la_hold = NULL;
 }
 
 static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
 static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet6_icmp6);
 #endif
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
 	CTLFLAG_RD, nd6_sysctl_drlist, "");
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
 	CTLFLAG_RD, nd6_sysctl_prlist, "");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
 SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
 
 static int
 nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_defrouter d;
 	struct nd_defrouter *dr;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	bzero(&d, sizeof(d));
 	d.rtaddr.sin6_family = AF_INET6;
 	d.rtaddr.sin6_len = sizeof(d.rtaddr);
 
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 		d.rtaddr.sin6_addr = dr->rtaddr;
 		error = sa6_recoverscope(&d.rtaddr);
 		if (error != 0)
 			break;
 		d.flags = dr->raflags;
 		d.rtlifetime = dr->rtlifetime;
 		d.expire = dr->expire + (time_second - time_uptime);
 		d.if_index = dr->ifp->if_index;
 		error = SYSCTL_OUT(req, &d, sizeof(d));
 		if (error != 0)
 			break;
 	}
 	ND6_RUNLOCK();
 	return (error);
 }
 
 static int
 nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_prefix p;
 	struct sockaddr_in6 s6;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfr;
 	time_t maxexpire;
 	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	if (req->newptr)
 		return (EPERM);
 
 	bzero(&p, sizeof(p));
 	p.origin = PR_ORIG_RA;
 	bzero(&s6, sizeof(s6));
 	s6.sin6_family = AF_INET6;
 	s6.sin6_len = sizeof(s6);
 
 	/*
 	 * XXX locking
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		p.prefix = pr->ndpr_prefix;
 		if (sa6_recoverscope(&p.prefix)) {
 			log(LOG_ERR, "scope error in prefix list (%s)\n",
 			    ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
 			/* XXX: press on... */
 		}
 		p.raflags = pr->ndpr_raf;
 		p.prefixlen = pr->ndpr_plen;
 		p.vltime = pr->ndpr_vltime;
 		p.pltime = pr->ndpr_pltime;
 		p.if_index = pr->ndpr_ifp->if_index;
 		if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 			p.expire = 0;
 		else {
 			/* XXX: we assume time_t is signed. */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
 				p.expire = pr->ndpr_lastupdate +
 				    pr->ndpr_vltime +
 				    (time_second - time_uptime);
 			else
 				p.expire = maxexpire;
 		}
 		p.refcnt = pr->ndpr_refcnt;
 		p.flags = pr->ndpr_stateflags;
 		p.advrtrs = 0;
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
 			p.advrtrs++;
 		error = SYSCTL_OUT(req, &p, sizeof(p));
 		if (error != 0)
 			return (error);
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 			s6.sin6_addr = pfr->router->rtaddr;
 			if (sa6_recoverscope(&s6))
 				log(LOG_ERR,
 				    "scope error in prefix list (%s)\n",
 				    ip6_sprintf(ip6buf, &pfr->router->rtaddr));
 			error = SYSCTL_OUT(req, &s6, sizeof(s6));
 			if (error != 0)
 				return (error);
 		}
 	}
 	return (0);
 }
Index: projects/vnet/sys/netinet6/nd6.h
===================================================================
--- projects/vnet/sys/netinet6/nd6.h	(revision 299172)
+++ projects/vnet/sys/netinet6/nd6.h	(revision 299173)
@@ -1,475 +1,475 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $
  * $FreeBSD$
  */
 
 #ifndef _NETINET6_ND6_H_
 #define _NETINET6_ND6_H_
 
 /* see net/route.h, or net/if_inarp.h */
 #ifndef RTF_ANNOUNCE
 #define RTF_ANNOUNCE	RTF_PROTO2
 #endif
 
 #include <sys/queue.h>
 #include <sys/callout.h>
 
 struct llentry;
 
 #define ND6_LLINFO_NOSTATE	-2
 /*
  * We don't need the WAITDELETE state any more, but we keep the definition
  * in a comment line instead of removing it. This is necessary to avoid
  * unintentionally reusing the value for another purpose, which might
  * affect backward compatibility with old applications.
  * (20000711 jinmei@kame.net)
  */
 /* #define ND6_LLINFO_WAITDELETE	-1 */
 #define ND6_LLINFO_INCOMPLETE	0
 #define ND6_LLINFO_REACHABLE	1
 #define ND6_LLINFO_STALE	2
 #define ND6_LLINFO_DELAY	3
 #define ND6_LLINFO_PROBE	4
 
 #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE)
 #define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE))
 
 struct nd_ifinfo {
 	u_int32_t linkmtu;		/* LinkMTU */
 	u_int32_t maxmtu;		/* Upper bound of LinkMTU */
 	u_int32_t basereachable;	/* BaseReachableTime */
 	u_int32_t reachable;		/* Reachable Time */
 	u_int32_t retrans;		/* Retrans Timer */
 	u_int32_t flags;		/* Flags */
 	int recalctm;			/* BaseReacable re-calculation timer */
 	u_int8_t chlim;			/* CurHopLimit */
 	u_int8_t initialized; /* Flag to see the entry is initialized */
 	/* the following 3 members are for privacy extension for addrconf */
 	u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */
 	u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */
 	u_int8_t randomid[8];	/* current random ID */
 };
 
 #define ND6_IFF_PERFORMNUD	0x1
 #define ND6_IFF_ACCEPT_RTADV	0x2
 #define ND6_IFF_PREFER_SOURCE	0x4 /* Not used in FreeBSD. */
 #define ND6_IFF_IFDISABLED	0x8 /* IPv6 operation is disabled due to
 				     * DAD failure.  (XXX: not ND-specific)
 				     */
 #define ND6_IFF_DONT_SET_IFROUTE	0x10
 #define ND6_IFF_AUTO_LINKLOCAL	0x20
 #define	ND6_IFF_NO_RADR		0x40
 #define ND6_IFF_NO_PREFER_IFACE	0x80 /* XXX: not related to ND. */
 #define ND6_IFF_NO_DAD		0x100
 
 #ifdef _KERNEL
 #define ND_IFINFO(ifp) \
 	(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo)
 #define IN6_LINKMTU(ifp) \
 	((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \
 	    ? ND_IFINFO(ifp)->linkmtu \
 	    : ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \
 		? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu))
 #endif
 
 struct in6_nbrinfo {
 	char ifname[IFNAMSIZ];	/* if name, e.g. "en0" */
 	struct in6_addr addr;	/* IPv6 address of the neighbor */
 	long	asked;		/* number of queries already sent for this addr */
 	int	isrouter;	/* if it acts as a router */
 	int	state;		/* reachability state */
 	int	expire;		/* lifetime for NDP state transition */
 };
 
 #define DRLSTSIZ 10
 #define PRLSTSIZ 10
 struct	in6_drlist {
 	char ifname[IFNAMSIZ];
 	struct {
 		struct	in6_addr rtaddr;
 		u_char	flags;
 		u_short	rtlifetime;
 		u_long	expire;
 		u_short if_index;
 	} defrouter[DRLSTSIZ];
 };
 
 struct	in6_defrouter {
 	struct	sockaddr_in6 rtaddr;
 	u_char	flags;
 	u_short	rtlifetime;
 	u_long	expire;
 	u_short if_index;
 };
 
 #ifdef _KERNEL
 struct	in6_oprlist {
 	char ifname[IFNAMSIZ];
 	struct {
 		struct	in6_addr prefix;
 		struct prf_ra raflags;
 		u_char	prefixlen;
 		u_char	origin;
 		u_long vltime;
 		u_long pltime;
 		u_long expire;
 		u_short if_index;
 		u_short advrtrs; /* number of advertisement routers */
 		struct	in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
 	} prefix[PRLSTSIZ];
 };
 #endif
 
 struct	in6_prlist {
 	char ifname[IFNAMSIZ];
 	struct {
 		struct	in6_addr prefix;
 		struct prf_ra raflags;
 		u_char	prefixlen;
 		u_char	origin;
 		u_int32_t vltime;
 		u_int32_t pltime;
 		time_t expire;
 		u_short if_index;
 		u_short advrtrs; /* number of advertisement routers */
 		struct	in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
 	} prefix[PRLSTSIZ];
 };
 
 struct in6_prefix {
 	struct	sockaddr_in6 prefix;
 	struct prf_ra raflags;
 	u_char	prefixlen;
 	u_char	origin;
 	u_int32_t vltime;
 	u_int32_t pltime;
 	time_t expire;
 	u_int32_t flags;
 	int refcnt;
 	u_short if_index;
 	u_short advrtrs; /* number of advertisement routers */
 	/* struct sockaddr_in6 advrtr[] */
 };
 
 #ifdef _KERNEL
 struct	in6_ondireq {
 	char ifname[IFNAMSIZ];
 	struct {
 		u_int32_t linkmtu;	/* LinkMTU */
 		u_int32_t maxmtu;	/* Upper bound of LinkMTU */
 		u_int32_t basereachable; /* BaseReachableTime */
 		u_int32_t reachable;	/* Reachable Time */
 		u_int32_t retrans;	/* Retrans Timer */
 		u_int32_t flags;	/* Flags */
 		int recalctm;		/* BaseReacable re-calculation timer */
 		u_int8_t chlim;		/* CurHopLimit */
 		u_int8_t receivedra;
 	} ndi;
 };
 #endif
 
 struct	in6_ndireq {
 	char ifname[IFNAMSIZ];
 	struct nd_ifinfo ndi;
 };
 
 struct	in6_ndifreq {
 	char ifname[IFNAMSIZ];
 	u_long ifindex;
 };
 
 /* Prefix status */
 #define NDPRF_ONLINK		0x1
 #define NDPRF_DETACHED		0x2
 
 /* protocol constants */
 #define MAX_RTR_SOLICITATION_DELAY	1	/* 1sec */
 #define RTR_SOLICITATION_INTERVAL	4	/* 4sec */
 #define MAX_RTR_SOLICITATIONS		3
 
 #define ND6_INFINITE_LIFETIME		0xffffffff
 
 #ifdef _KERNEL
 /* node constants */
 #define MAX_REACHABLE_TIME		3600000	/* msec */
 #define REACHABLE_TIME			30000	/* msec */
 #define RETRANS_TIMER			1000	/* msec */
 #define MIN_RANDOM_FACTOR		512	/* 1024 * 0.5 */
 #define MAX_RANDOM_FACTOR		1536	/* 1024 * 1.5 */
 #define DEF_TEMP_VALID_LIFETIME		604800	/* 1 week */
 #define DEF_TEMP_PREFERRED_LIFETIME	86400	/* 1 day */
 #define TEMPADDR_REGEN_ADVANCE		5	/* sec */
 #define MAX_TEMP_DESYNC_FACTOR		600	/* 10 min */
 #define ND_COMPUTE_RTIME(x) \
 		(((MIN_RANDOM_FACTOR * (x >> 10)) + (arc4random() & \
 		((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * (x >> 10)))) /1000)
 
 TAILQ_HEAD(nd_drhead, nd_defrouter);
 struct nd_defrouter {
 	TAILQ_ENTRY(nd_defrouter) dr_entry;
 	struct in6_addr rtaddr;
 	u_char	raflags;	/* flags on RA message */
 	u_short	rtlifetime;
 	u_long	expire;
 	struct ifnet *ifp;
 	int	installed;	/* is installed into kernel routing table */
 	u_int	refcnt;
 };
 
 struct nd_prefixctl {
 	struct ifnet *ndpr_ifp;
 
 	/* prefix */
 	struct sockaddr_in6 ndpr_prefix;
 	u_char	ndpr_plen;
 
 	u_int32_t ndpr_vltime;	/* advertised valid lifetime */
 	u_int32_t ndpr_pltime;	/* advertised preferred lifetime */
 
 	struct prf_ra ndpr_flags;
 };
 
 
 struct nd_prefix {
 	struct ifnet *ndpr_ifp;
 	LIST_ENTRY(nd_prefix) ndpr_entry;
 	struct sockaddr_in6 ndpr_prefix;	/* prefix */
 	struct in6_addr ndpr_mask; /* netmask derived from the prefix */
 
 	u_int32_t ndpr_vltime;	/* advertised valid lifetime */
 	u_int32_t ndpr_pltime;	/* advertised preferred lifetime */
 
 	time_t ndpr_expire;	/* expiration time of the prefix */
 	time_t ndpr_preferred;	/* preferred time of the prefix */
 	time_t ndpr_lastupdate; /* reception time of last advertisement */
 
 	struct prf_ra ndpr_flags;
 	u_int32_t ndpr_stateflags; /* actual state flags */
 	/* list of routers that advertise the prefix: */
 	LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs;
 	u_char	ndpr_plen;
 	int	ndpr_refcnt;	/* reference couter from addresses */
 };
 
 #define ndpr_raf		ndpr_flags
 #define ndpr_raf_onlink		ndpr_flags.onlink
 #define ndpr_raf_auto		ndpr_flags.autonomous
 #define ndpr_raf_router		ndpr_flags.router
 
 /*
  * Message format for use in obtaining information about prefixes
  * from inet6 sysctl function
  */
 struct inet6_ndpr_msghdr {
 	u_short	inpm_msglen;	/* to skip over non-understood messages */
 	u_char	inpm_version;	/* future binary compatibility */
 	u_char	inpm_type;	/* message type */
 	struct in6_addr inpm_prefix;
 	u_long	prm_vltim;
 	u_long	prm_pltime;
 	u_long	prm_expire;
 	u_long	prm_preferred;
 	struct in6_prflags prm_flags;
 	u_short	prm_index;	/* index for associated ifp */
 	u_char	prm_plen;	/* length of prefix in bits */
 };
 
 #define prm_raf_onlink		prm_flags.prf_ra.onlink
 #define prm_raf_auto		prm_flags.prf_ra.autonomous
 
 #define prm_statef_onlink	prm_flags.prf_state.onlink
 
 #define prm_rrf_decrvalid	prm_flags.prf_rr.decrvalid
 #define prm_rrf_decrprefd	prm_flags.prf_rr.decrprefd
 
 struct nd_pfxrouter {
 	LIST_ENTRY(nd_pfxrouter) pfr_entry;
 	struct nd_defrouter *router;
 };
 
 LIST_HEAD(nd_prhead, nd_prefix);
 
 /* nd6.c */
 VNET_DECLARE(int, nd6_prune);
 VNET_DECLARE(int, nd6_delay);
 VNET_DECLARE(int, nd6_umaxtries);
 VNET_DECLARE(int, nd6_mmaxtries);
 VNET_DECLARE(int, nd6_useloopback);
 VNET_DECLARE(int, nd6_maxnudhint);
 VNET_DECLARE(int, nd6_gctimer);
 VNET_DECLARE(struct nd_drhead, nd_defrouter);
 VNET_DECLARE(struct nd_prhead, nd_prefix);
 VNET_DECLARE(int, nd6_debug);
 VNET_DECLARE(int, nd6_onlink_ns_rfc4861);
 #define	V_nd6_prune			VNET(nd6_prune)
 #define	V_nd6_delay			VNET(nd6_delay)
 #define	V_nd6_umaxtries			VNET(nd6_umaxtries)
 #define	V_nd6_mmaxtries			VNET(nd6_mmaxtries)
 #define	V_nd6_useloopback		VNET(nd6_useloopback)
 #define	V_nd6_maxnudhint		VNET(nd6_maxnudhint)
 #define	V_nd6_gctimer			VNET(nd6_gctimer)
 #define	V_nd_defrouter			VNET(nd_defrouter)
 #define	V_nd_prefix			VNET(nd_prefix)
 #define	V_nd6_debug			VNET(nd6_debug)
 #define	V_nd6_onlink_ns_rfc4861		VNET(nd6_onlink_ns_rfc4861)
 
 /* Lock for the prefix and default router lists. */
 VNET_DECLARE(struct rwlock, nd6_lock);
 #define	V_nd6_lock			VNET(nd6_lock)
 
 #define	ND6_RLOCK()			rw_rlock(&V_nd6_lock)
 #define	ND6_RUNLOCK()			rw_runlock(&V_nd6_lock)
 #define	ND6_WLOCK()			rw_wlock(&V_nd6_lock)
 #define	ND6_WUNLOCK()			rw_wunlock(&V_nd6_lock)
 #define	ND6_WLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_WLOCKED)
 #define	ND6_RLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_RLOCKED)
 #define	ND6_LOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_LOCKED)
 #define	ND6_UNLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_UNLOCKED)
 
 #define nd6log(x)	do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0)
 
 /* nd6_rtr.c */
 VNET_DECLARE(int, nd6_defifindex);
 VNET_DECLARE(int, ip6_desync_factor);	/* seconds */
 VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */
 VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */
 VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */
 #define	V_nd6_defifindex		VNET(nd6_defifindex)
 #define	V_ip6_desync_factor		VNET(ip6_desync_factor)
 #define	V_ip6_temp_preferred_lifetime	VNET(ip6_temp_preferred_lifetime)
 #define	V_ip6_temp_valid_lifetime	VNET(ip6_temp_valid_lifetime)
 #define	V_ip6_temp_regen_advance	VNET(ip6_temp_regen_advance)
 
 union nd_opts {
 	struct nd_opt_hdr *nd_opt_array[16];	/* max = ND_OPT_NONCE */
 	struct {
 		struct nd_opt_hdr *zero;
 		struct nd_opt_hdr *src_lladdr;
 		struct nd_opt_hdr *tgt_lladdr;
 		struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */
 		struct nd_opt_rd_hdr *rh;
 		struct nd_opt_mtu *mtu;
 		struct nd_opt_hdr *__res6;
 		struct nd_opt_hdr *__res7;
 		struct nd_opt_hdr *__res8;
 		struct nd_opt_hdr *__res9;
 		struct nd_opt_hdr *__res10;
 		struct nd_opt_hdr *__res11;
 		struct nd_opt_hdr *__res12;
 		struct nd_opt_hdr *__res13;
 		struct nd_opt_nonce *nonce;
 		struct nd_opt_hdr *__res15;
 		struct nd_opt_hdr *search;	/* multiple opts */
 		struct nd_opt_hdr *last;	/* multiple opts */
 		int done;
 		struct nd_opt_prefix_info *pi_end;/* multiple opts, end */
 	} nd_opt_each;
 };
 #define nd_opts_src_lladdr	nd_opt_each.src_lladdr
 #define nd_opts_tgt_lladdr	nd_opt_each.tgt_lladdr
 #define nd_opts_pi		nd_opt_each.pi_beg
 #define nd_opts_pi_end		nd_opt_each.pi_end
 #define nd_opts_rh		nd_opt_each.rh
 #define nd_opts_mtu		nd_opt_each.mtu
 #define nd_opts_nonce		nd_opt_each.nonce
 #define nd_opts_search		nd_opt_each.search
 #define nd_opts_last		nd_opt_each.last
 #define nd_opts_done		nd_opt_each.done
 
 /* XXX: need nd6_var.h?? */
 /* nd6.c */
 void nd6_init(void);
 #ifdef VIMAGE
 void nd6_destroy(void);
 #endif
 struct nd_ifinfo *nd6_ifattach(struct ifnet *);
-void nd6_ifdetach(struct nd_ifinfo *);
+void nd6_ifdetach(struct ifnet *, struct nd_ifinfo *);
 int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *);
 void nd6_option_init(void *, int, union nd_opts *);
 struct nd_opt_hdr *nd6_option(union nd_opts *);
 int nd6_options(union nd_opts *);
 struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *);
 struct llentry *nd6_alloc(const struct in6_addr *, int, struct ifnet *);
 void nd6_setmtu(struct ifnet *);
 void nd6_llinfo_setstate(struct llentry *lle, int newstate);
 void nd6_timer(void *);
 void nd6_purge(struct ifnet *);
 int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags);
 int nd6_resolve(struct ifnet *, int, struct mbuf *,
     const struct sockaddr *, u_char *, uint32_t *);
 int nd6_ioctl(u_long, caddr_t, struct ifnet *);
 void nd6_cache_lladdr(struct ifnet *, struct in6_addr *,
 	char *, int, int, int);
 void nd6_grab_holdchain(struct llentry *, struct mbuf **,
     struct sockaddr_in6 *);
 int nd6_flush_holdchain(struct ifnet *, struct ifnet *, struct mbuf *,
     struct sockaddr_in6 *);
 int nd6_add_ifa_lle(struct in6_ifaddr *);
 void nd6_rem_ifa_lle(struct in6_ifaddr *, int);
 int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *,
     struct sockaddr_in6 *, struct route *);
 
 /* nd6_nbr.c */
 void nd6_na_input(struct mbuf *, int, int);
 void nd6_na_output(struct ifnet *, const struct in6_addr *,
 	const struct in6_addr *, u_long, int, struct sockaddr *);
 void nd6_ns_input(struct mbuf *, int, int);
 void nd6_ns_output(struct ifnet *, const struct in6_addr *,
 	const struct in6_addr *, const struct in6_addr *, uint8_t *);
 caddr_t nd6_ifptomac(struct ifnet *);
 void nd6_dad_init(void);
 void nd6_dad_start(struct ifaddr *, int);
 void nd6_dad_stop(struct ifaddr *);
 
 /* nd6_rtr.c */
 void nd6_rs_input(struct mbuf *, int, int);
 void nd6_ra_input(struct mbuf *, int, int);
 void defrouter_reset(void);
 void defrouter_select(void);
 void defrouter_ref(struct nd_defrouter *);
 void defrouter_rele(struct nd_defrouter *);
 bool defrouter_remove(struct in6_addr *, struct ifnet *);
 void defrouter_unlink(struct nd_defrouter *, struct nd_drhead *);
 void defrouter_del(struct nd_defrouter *);
 void prelist_remove(struct nd_prefix *);
 int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *,
 	struct nd_prefix **);
 void pfxlist_onlink_check(void);
 struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *);
 struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *);
 struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *);
 void rt6_flush(struct in6_addr *, struct ifnet *);
 int nd6_setdefaultiface(int);
 int in6_tmpifadd(const struct in6_ifaddr *, int, int);
 
 #endif /* _KERNEL */
 
 #endif /* _NETINET6_ND6_H_ */