Index: projects/vnet/sys/net/bpf.c =================================================================== --- projects/vnet/sys/net/bpf.c (revision 297511) +++ projects/vnet/sys/net/bpf.c (revision 297512) @@ -1,2979 +1,3041 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_compat.h" +#include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef DDB +#include +#endif + #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct ifnet *bif_ifp; /* corresponding interface */ struct rwlock bif_lock; /* interface lock */ LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ int bif_flags; /* Interface flags */ }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1)) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. 
We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ static LIST_HEAD(, bpf_if) bpf_iflist, bpf_freelist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); static VNET_DEFINE(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; eventhandler_tag bpf_ifdetach_cookie = NULL; /* * LOCKING MODEL USED BY BPF: * Locks: * 1) global lock (BPF_LOCK). 
Mutex, used to protect interface addition/removal, * some global counters and every bpf_if reference. * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters. * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields * used by bpf_mtap code. * * Lock order: * * Global lock, interface lock, descriptor lock * * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2] * working model. In many places (like bpf_detachd) we start with BPF descriptor * (and we need to at least rlock it to get reliable interface pointer). This * gives us potential LOR. As a result, we use global lock to protect from bpf_if * change in every such place. * * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and * 3) descriptor main wlock. * Reading bd_bif can be protected by any of these locks, typically global lock. * * Changing read/write BPF filter is protected by the same three locks, * the same applies for reading. * * Sleeping in global lock is not allowed due to bpfdetach() using it. */ /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, et. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. 
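 * For kernel buffers nothing needs to happen here; for zero-copy
 * buffers bpf_zerocopy_bufheld() is given a chance to update the
 * shared buffer state so user space can pick the buffer up.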
*/ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. 
*/ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFIF_WLOCK(bp); BPFD_LOCK(d); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that appliation usually sets its own filter */ d->bd_writer = 2; } else LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). 
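 * That initial program is a single accept-all instruction of the form
 * BPF_STMT(BPF_RET | BPF_K, snaplen), which is exactly what the opcode
 * test below looks for.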
* Note we're not checking .k value here: * while pcap_open_live() definitely sets to to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Add d to the list of active bp filters. * Requires bpf_attachd() to be called before. */ static void bpf_upgraded(struct bpf_d *d) { struct bpf_if *bp; BPF_LOCK_ASSERT(); bp = d->bd_bif; /* * Filter can be set several times without specifying interface. * Mark d as reader and exit. */ if (bp == NULL) { BPFD_LOCK(d); d->bd_writer = 0; BPFD_UNLOCK(d); return; } BPFIF_WLOCK(bp); BPFD_LOCK(d); /* Remove from writers-only list */ LIST_REMOVE(d, bd_next); LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); /* Mark d as reader */ d->bd_writer = 0; BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d) { int error; struct bpf_if *bp; struct ifnet *ifp; CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); BPF_LOCK_ASSERT(); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFIF_WLOCK(bp); BPFD_LOCK(d); /* Save bd_writer value */ error = d->bd_writer; /* * Remove d from the interface's descriptor list. */ LIST_REMOVE(d, bd_next); ifp = bp->bif_ifp; d->bd_bif = NULL; BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpf_freed(d); free(d, M_BPF); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. 
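 * (Nowadays every open(2) gets its own descriptor via
 * devfs_set_cdevpriv(), so concurrent opens are fine; the ENXIO/EBUSY
 * wording describes the historical non-cloning device.)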
*/ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET|PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. 
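 * Note the descriptor lock is dropped across bpf_uiomove(), which is
 * why bd_hbuf_in_use is set: it keeps reset_d() and other readers away
 * from the hold buffer while the copyout is in flight.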
*/ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; struct route ro; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); BPF_PID_REFRESH_CUR(d); d->bd_wcount++; /* XXX: locking required */ if (d->bd_bif == NULL) { d->bd_wdcount++; return (ENXIO); } ifp = d->bd_bif->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { d->bd_wdcount++; return (ENETDOWN); } if (uio->uio_resid == 0) { d->bd_wdcount++; return (0); } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* XXX: bpf_movein() can sleep */ error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d); if (error) { d->bd_wdcount++; return (error); } d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC BPFD_LOCK(d); mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); BPFD_UNLOCK(d); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) d->bd_wdcount++; if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } CURVNET_RESTORE(); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. 
*/ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; d->bd_wcount = 0; d->bd_wfcount = 0; d->bd_wdcount = 0; d->bd_zcopy = 0; } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. 
*/ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. 
*/ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. 
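 *
 * For a zero-copy consumer this means the mode has to be selected
 * right after open(2), before buffers are installed or an interface is
 * bound; a hypothetical user-space sequence (fd, bz and ifr set up
 * elsewhere) would be:
 *
 *	u_int mode = BPF_BUFMODE_ZBUF;
 *	ioctl(fd, BIOCSETBUFMODE, &mode);
 *	ioctl(fd, BIOCSETZBUF, &bz);
 *	ioctl(fd, BIOCSETIF, &ifr);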
*/ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls * since reading d->bd_bif can't be protected by d or interface lock due to * lock order. * * Additionally, we have to acquire interface write lock due to bpf_mtap() uses * interface read lock to read all filers. * */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_insn *fcode, *old; #ifdef BPF_JITTER bpf_jit_filter *jfunc, *ofunc; #endif size_t size; u_int flen; int need_upgrade; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif fcode = NULL; #ifdef BPF_JITTER jfunc = ofunc = NULL; #endif need_upgrade = 0; /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. */ fcode = malloc(size, M_BPF, M_WAITOK); if (copyin(fp->bf_insns, fcode, size) != 0 || !bpf_validate(fcode, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER /* Filter is copied inside fcode and is perfectly valid. */ jfunc = bpf_jitter(fcode, flen); #endif } BPF_LOCK(); /* * Set up new filter. * Protect filter change by interface lock. * Additionally, we are protected by global lock here. */ if (d->bd_bif != NULL) BPFIF_WLOCK(d->bd_bif); BPFD_LOCK(d); if (cmd == BIOCSETWF) { old = d->bd_wfilter; d->bd_wfilter = fcode; } else { old = d->bd_rfilter; d->bd_rfilter = fcode; #ifdef BPF_JITTER ofunc = d->bd_bfilter; d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen); } BPFD_UNLOCK(d); if (d->bd_bif != NULL) BPFIF_WUNLOCK(d->bd_bif); if (old != NULL) free(old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif /* Move d to active readers list. */ if (need_upgrade != 0) bpf_upgraded(d); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. 
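 * By the time we get here the store buffer must already exist: the
 * BIOCSETIF handler allocates it for the kernel-buffer mode, while
 * zero-copy callers must have issued BIOCSETZBUF first.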
*/ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* Check if interface is not being detached from BPF */ BPFIF_RLOCK(bp); if (bp->bif_flags & BPFIF_FLAG_DYING) { BPFIF_RUNLOCK(bp); return (ENXIO); } BPFIF_RUNLOCK(bp); /* * At this point, we expect the buffer is already allocated. If not, * return an error. */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; /* * Ignore the hold buffer if it is being copied to user space. 
*/ if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { /* * We are not using any locks for d here because: * 1) any filter change is protected by interface * write lock * 2) destroying/detaching d is protected by interface * write lock, too */ /* XXX: Do not protect counter for the sake of performance. */ ++d->bd_rcount; /* * NB: We dont call BPF_CHECK_DIRECTION() here since there is no * way for the caller to indiciate to us whether this packet * is inbound or outbound. In the bpf_mtap() routines, we use * the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; ++d->bd_rcount; #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. 
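 * The JITed filter expects a contiguous buffer, so mbuf chains fall
 * back to bpf_filter(), which knows how to walk a chain when it is
 * handed a buffer length of 0.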
*/ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. 
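 * (With the pluggable buffer code the copy routines passed in are the
 * bpf_append_bytes()/bpf_append_mbuf() wrappers above rather than
 * bcopy() itself.)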
*/ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); ++d->bd_dcount; return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. 
*/ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ bpf_free(d); if (d->bd_rfilter != NULL) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER if (d->bd_bfilter != NULL) bpf_destroy_jit_filter(d->bd_bfilter); #endif } if (d->bd_wfilter != NULL) free((caddr_t)d->bd_wfilter, M_BPF); mtx_destroy(&d->bd_lock); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); if (bp == NULL) panic("bpfattach"); LIST_INIT(&bp->bif_dlist); LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; rw_init(&bp->bif_lock, "bpf interface lock"); KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); *driverp = bp; BPF_LOCK(); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); bp->bif_hdrlen = hdrlen; if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } +#ifdef VIMAGE /* + * When moving interfaces between vnet instances we need a way to + * query the dlt and hdrlen before detach so we can re-attch the if_bpf + * after the vmove. We unfortunately have no device driver infrastructure + * to query the interface for these values after creation/attach, thus + * add this as a workaround. + */ +int +bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) +{ + + if (bp == NULL) + return (ENXIO); + if (bif_dlt == NULL && bif_hdrlen == NULL) + return (0); + + if (bif_dlt != NULL) + *bif_dlt = bp->bif_dlt; + if (bif_hdrlen != NULL) + *bif_hdrlen = bp->bif_hdrlen; + + return (0); +} +#endif + +/* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; int ndetached; ndetached = 0; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; LIST_REMOVE(bp, bif_next); /* Add to to-be-freed list */ LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next); ndetached++; /* * Delay freeing bp till interface is detached * and all routes through this interface are removed. * Mark bp as detached to restrict new consumers. 
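 * The actual free happens later in the interface departure handler,
 * bpf_ifdetach(), once the ifnet is really gone.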
*/ BPFIF_WLOCK(bp); bp->bif_flags |= BPFIF_FLAG_DYING; BPFIF_WUNLOCK(bp); CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Free common descriptors */ while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } /* Free writer-only descriptors */ while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } } BPF_UNLOCK(); #ifdef INVARIANTS if (ndetached == 0) printf("bpfdetach: %s was not attached\n", ifp->if_xname); #endif } /* * Interface departure handler. * Note departure event does not guarantee interface is going down. * Interface renaming is currently done via departure/arrival event set. * * Departure handled is called after all routes pointing to * given interface are removed and interface is in down state * restricting any packets to be sent/received. We assume it is now safe * to free data allocated by BPF. */ static void bpf_ifdetach(void *arg __unused, struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; int nmatched = 0; BPF_LOCK(); /* * Find matching entries in free list. * Nothing should be found if bpfdetach() was not called. */ LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p", __func__, bp, ifp); LIST_REMOVE(bp, bif_next); rw_destroy(&bp->bif_lock); free(bp, M_BPF); nmatched++; } BPF_UNLOCK(); /* * Note that we cannot zero other pointers to * custom DLTs possibly used by given interface. */ if (nmatched != 0) ifp->if_bpf = NULL; } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; again: n1 = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); BPF_UNLOCK(); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; BPF_LOCK(); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (n > n1) { free(lst, M_TEMP); goto again; } lst[n] = bp->bif_dlt; n++; } BPF_UNLOCK(); error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); BPF_LOCK(); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp != NULL) { opromisc = d->bd_promisc; bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "bpf_setdlt: ifpromisc failed (%d)\n", error); else d->bd_promisc = 1; } } return (bp == NULL ? 
EINVAL : 0); } static void bpf_drvinit(void *unused) { struct cdev *dev; mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); LIST_INIT(&bpf_freelist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); /* Register interface departure handler */ bpf_ifdetach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bpf_ifdetach, NULL, EVENTHANDLER_PRI_ANY); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_RLOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { BPFD_LOCK(bd); bd->bd_rcount = 0; bd->bd_dcount = 0; bd->bd_fcount = 0; bd->bd_wcount = 0; bd->bd_wfcount = 0; bd->bd_zcopy = 0; BPFD_UNLOCK(bd); } BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); /* XXX: reading should be protected by global lock */ d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = bd->bd_rcount; d->bd_dcount = bd->bd_dcount; d->bd_fcount = bd->bd_fcount; d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = bd->bd_wcount; d->bd_wdcount = bd->bd_wdcount; d->bd_wfcount = bd->bd_wfcount; d->bd_zcopy = bd->bd_zcopy; d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. 
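	 *
	 * Illustrative only (not part of this change): userland can request
	 * the zeroing by writing an all-zero object of the exact size this
	 * handler compares against (struct xbpf_d).  The MIB name
	 * "net.bpf.stats" is an assumption made for the sake of the example.
	 *
	 *	struct xbpf_d zero;
	 *
	 *	memset(&zero, 0, sizeof(zero));
	 *	if (sysctlbyname("net.bpf.stats", NULL, NULL,
	 *	    &zero, sizeof(zero)) == -1)
	 *		warn("sysctlbyname");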
*/ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_RLOCK(bp); /* Send writers-only first */ LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ static struct bpf_if bp_null; void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = &bp_null; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ + +#ifdef DDB +static void +bpf_show_bpf_if(struct bpf_if *bpf_if) +{ + + if (bpf_if == NULL) + return; + db_printf("%p:\n", bpf_if); +#define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); + /* bif_ext.bif_next */ + /* bif_ext.bif_dlist */ + BPF_DB_PRINTF("%#x", bif_dlt); + BPF_DB_PRINTF("%u", bif_hdrlen); + BPF_DB_PRINTF("%p", bif_ifp); + /* bif_lock */ + /* bif_wlist */ + BPF_DB_PRINTF("%#x", bif_flags); +} + +DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) +{ + + if (!have_addr) { + db_printf("usage: show bpf_if \n"); + return; + } + + bpf_show_bpf_if((struct bpf_if *)addr); +} +#endif Index: projects/vnet/sys/net/bpf.h =================================================================== --- projects/vnet/sys/net/bpf.h (revision 297511) +++ projects/vnet/sys/net/bpf.h (revision 297512) @@ -1,1518 +1,1521 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.h 8.1 (Berkeley) 6/10/93 * @(#)bpf.h 1.34 (LBL) 6/16/96 * * $FreeBSD$ */ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ /* BSD style release date */ #define BPF_RELEASE 199606 typedef int32_t bpf_int32; typedef u_int32_t bpf_u_int32; typedef int64_t bpf_int64; typedef u_int64_t bpf_u_int64; /* * Alignment macros. BPF_WORDALIGN rounds up to the next * even multiple of BPF_ALIGNMENT. */ #define BPF_ALIGNMENT sizeof(long) #define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1)) #define BPF_MAXINSNS 512 #define BPF_MAXBUFSIZE 0x80000 #define BPF_MINBUFSIZE 32 /* * Structure for BIOCSETF. */ struct bpf_program { u_int bf_len; struct bpf_insn *bf_insns; }; /* * Struct returned by BIOCGSTATS. */ struct bpf_stat { u_int bs_recv; /* number of packets received */ u_int bs_drop; /* number of packets dropped */ }; /* * Struct return by BIOCVERSION. This represents the version number of * the filter language described by the instruction encodings below. * bpf understands a program iff kernel_major == filter_major && * kernel_minor >= filter_minor, that is, if the value returned by the * running kernel has the same major number and a minor number equal * equal to or less than the filter being downloaded. Otherwise, the * results are undefined, meaning an error may be returned or packets * may be accepted haphazardly. * It has nothing to do with the source code version. */ struct bpf_version { u_short bv_major; u_short bv_minor; }; /* Current version number of filter architecture. */ #define BPF_MAJOR_VERSION 1 #define BPF_MINOR_VERSION 1 /* * Historically, BPF has supported a single buffering model, first using mbuf * clusters in kernel, and later using malloc(9) buffers in kernel. We now * support multiple buffering modes, which may be queried and set using * BIOCGETBUFMODE and BIOCSETBUFMODE. So as to avoid handling the complexity * of changing modes while sniffing packets, the mode becomes fixed once an * interface has been attached to the BPF descriptor. */ #define BPF_BUFMODE_BUFFER 1 /* Kernel buffers with read(). */ #define BPF_BUFMODE_ZBUF 2 /* Zero-copy buffers. */ /*- * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy * buffer as used by BPF. */ struct bpf_zbuf { void *bz_bufa; /* Location of 'a' zero-copy buffer. */ void *bz_bufb; /* Location of 'b' zero-copy buffer. 
*/ size_t bz_buflen; /* Size of zero-copy buffers. */ }; #define BIOCGBLEN _IOR('B', 102, u_int) #define BIOCSBLEN _IOWR('B', 102, u_int) #define BIOCSETF _IOW('B', 103, struct bpf_program) #define BIOCFLUSH _IO('B', 104) #define BIOCPROMISC _IO('B', 105) #define BIOCGDLT _IOR('B', 106, u_int) #define BIOCGETIF _IOR('B', 107, struct ifreq) #define BIOCSETIF _IOW('B', 108, struct ifreq) #define BIOCSRTIMEOUT _IOW('B', 109, struct timeval) #define BIOCGRTIMEOUT _IOR('B', 110, struct timeval) #define BIOCGSTATS _IOR('B', 111, struct bpf_stat) #define BIOCIMMEDIATE _IOW('B', 112, u_int) #define BIOCVERSION _IOR('B', 113, struct bpf_version) #define BIOCGRSIG _IOR('B', 114, u_int) #define BIOCSRSIG _IOW('B', 115, u_int) #define BIOCGHDRCMPLT _IOR('B', 116, u_int) #define BIOCSHDRCMPLT _IOW('B', 117, u_int) #define BIOCGDIRECTION _IOR('B', 118, u_int) #define BIOCSDIRECTION _IOW('B', 119, u_int) #define BIOCSDLT _IOW('B', 120, u_int) #define BIOCGDLTLIST _IOWR('B', 121, struct bpf_dltlist) #define BIOCLOCK _IO('B', 122) #define BIOCSETWF _IOW('B', 123, struct bpf_program) #define BIOCFEEDBACK _IOW('B', 124, u_int) #define BIOCGETBUFMODE _IOR('B', 125, u_int) #define BIOCSETBUFMODE _IOW('B', 126, u_int) #define BIOCGETZMAX _IOR('B', 127, size_t) #define BIOCROTZBUF _IOR('B', 128, struct bpf_zbuf) #define BIOCSETZBUF _IOW('B', 129, struct bpf_zbuf) #define BIOCSETFNR _IOW('B', 130, struct bpf_program) #define BIOCGTSTAMP _IOR('B', 131, u_int) #define BIOCSTSTAMP _IOW('B', 132, u_int) /* Obsolete */ #define BIOCGSEESENT BIOCGDIRECTION #define BIOCSSEESENT BIOCSDIRECTION /* Packet directions */ enum bpf_direction { BPF_D_IN, /* See incoming packets */ BPF_D_INOUT, /* See incoming and outgoing packets */ BPF_D_OUT /* See outgoing packets */ }; /* Time stamping functions */ #define BPF_T_MICROTIME 0x0000 #define BPF_T_NANOTIME 0x0001 #define BPF_T_BINTIME 0x0002 #define BPF_T_NONE 0x0003 #define BPF_T_FORMAT_MASK 0x0003 #define BPF_T_NORMAL 0x0000 #define BPF_T_FAST 0x0100 #define BPF_T_MONOTONIC 0x0200 #define BPF_T_MONOTONIC_FAST (BPF_T_FAST | BPF_T_MONOTONIC) #define BPF_T_FLAG_MASK 0x0300 #define BPF_T_FORMAT(t) ((t) & BPF_T_FORMAT_MASK) #define BPF_T_FLAG(t) ((t) & BPF_T_FLAG_MASK) #define BPF_T_VALID(t) \ ((t) == BPF_T_NONE || (BPF_T_FORMAT(t) != BPF_T_NONE && \ ((t) & ~(BPF_T_FORMAT_MASK | BPF_T_FLAG_MASK)) == 0)) #define BPF_T_MICROTIME_FAST (BPF_T_MICROTIME | BPF_T_FAST) #define BPF_T_NANOTIME_FAST (BPF_T_NANOTIME | BPF_T_FAST) #define BPF_T_BINTIME_FAST (BPF_T_BINTIME | BPF_T_FAST) #define BPF_T_MICROTIME_MONOTONIC (BPF_T_MICROTIME | BPF_T_MONOTONIC) #define BPF_T_NANOTIME_MONOTONIC (BPF_T_NANOTIME | BPF_T_MONOTONIC) #define BPF_T_BINTIME_MONOTONIC (BPF_T_BINTIME | BPF_T_MONOTONIC) #define BPF_T_MICROTIME_MONOTONIC_FAST (BPF_T_MICROTIME | BPF_T_MONOTONIC_FAST) #define BPF_T_NANOTIME_MONOTONIC_FAST (BPF_T_NANOTIME | BPF_T_MONOTONIC_FAST) #define BPF_T_BINTIME_MONOTONIC_FAST (BPF_T_BINTIME | BPF_T_MONOTONIC_FAST) /* * Structure prepended to each packet. 
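 *
 * Illustrative only (not part of the original header): a reader walks a
 * buffer filled by read(2) using the prepended header and BPF_WORDALIGN(),
 * where buf and nread are the buffer and byte count returned by that read.
 * Whether struct bpf_hdr or struct bpf_xhdr is prepended depends on the
 * time-stamp format selected with BIOCSTSTAMP; the classic struct bpf_hdr
 * is shown, and handle_packet() is a hypothetical callback.
 *
 *	u_char *p = buf;
 *
 *	while (p < buf + nread) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *
 *		handle_packet(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}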
*/ struct bpf_ts { bpf_int64 bt_sec; /* seconds */ bpf_u_int64 bt_frac; /* fraction */ }; struct bpf_xhdr { struct bpf_ts bh_tstamp; /* time stamp */ bpf_u_int32 bh_caplen; /* length of captured portion */ bpf_u_int32 bh_datalen; /* original length of packet */ u_short bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; /* Obsolete */ struct bpf_hdr { struct timeval bh_tstamp; /* time stamp */ bpf_u_int32 bh_caplen; /* length of captured portion */ bpf_u_int32 bh_datalen; /* original length of packet */ u_short bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #ifdef _KERNEL #define MTAG_BPF 0x627066 #define MTAG_BPF_TIMESTAMP 0 #endif /* * When using zero-copy BPF buffers, a shared memory header is present * allowing the kernel BPF implementation and user process to synchronize * without using system calls. This structure defines that header. When * accessing these fields, appropriate atomic operation and memory barriers * are required in order not to see stale or out-of-order data; see bpf(4) * for reference code to access these fields from userspace. * * The layout of this structure is critical, and must not be changed; if must * fit in a single page on all architectures. */ struct bpf_zbuf_header { volatile u_int bzh_kernel_gen; /* Kernel generation number. */ volatile u_int bzh_kernel_len; /* Length of data in the buffer. */ volatile u_int bzh_user_gen; /* User generation number. */ u_int _bzh_pad[5]; }; /* * Data-link level type codes. */ #define DLT_NULL 0 /* BSD loopback encapsulation */ #define DLT_EN10MB 1 /* Ethernet (10Mb) */ #define DLT_EN3MB 2 /* Experimental Ethernet (3Mb) */ #define DLT_AX25 3 /* Amateur Radio AX.25 */ #define DLT_PRONET 4 /* Proteon ProNET Token Ring */ #define DLT_CHAOS 5 /* Chaos */ #define DLT_IEEE802 6 /* IEEE 802 Networks */ #define DLT_ARCNET 7 /* ARCNET */ #define DLT_SLIP 8 /* Serial Line IP */ #define DLT_PPP 9 /* Point-to-point Protocol */ #define DLT_FDDI 10 /* FDDI */ #define DLT_ATM_RFC1483 11 /* LLC/SNAP encapsulated atm */ #define DLT_RAW 12 /* raw IP */ /* * These are values from BSD/OS's "bpf.h". * These are not the same as the values from the traditional libpcap * "bpf.h"; however, these values shouldn't be generated by any * OS other than BSD/OS, so the correct values to use here are the * BSD/OS values. * * Platforms that have already assigned these values to other * DLT_ codes, however, should give these codes the values * from that platform, so that programs that use these codes will * continue to compile - even though they won't correctly read * files of these types. */ #define DLT_SLIP_BSDOS 15 /* BSD/OS Serial Line IP */ #define DLT_PPP_BSDOS 16 /* BSD/OS Point-to-point Protocol */ #define DLT_ATM_CLIP 19 /* Linux Classical-IP over ATM */ /* * These values are defined by NetBSD; other platforms should refrain from * using them for other purposes, so that NetBSD savefiles with link * types of 50 or 51 can be read as this type on all platforms. */ #define DLT_PPP_SERIAL 50 /* PPP over serial with HDLC encapsulation */ #define DLT_PPP_ETHER 51 /* PPP over Ethernet */ /* * Reserved for the Symantec Enterprise Firewall. */ #define DLT_SYMANTEC_FIREWALL 99 /* * Values between 100 and 103 are used in capture file headers as * link-layer header type LINKTYPE_ values corresponding to DLT_ types * that differ between platforms; don't use those values for new DLT_ * new types. 
*/ /* * Values starting with 104 are used for newly-assigned link-layer * header type values; for those link-layer header types, the DLT_ * value returned by pcap_datalink() and passed to pcap_open_dead(), * and the LINKTYPE_ value that appears in capture files, are the * same. * * DLT_MATCHING_MIN is the lowest such value; DLT_MATCHING_MAX is * the highest such value. */ #define DLT_MATCHING_MIN 104 /* * This value was defined by libpcap 0.5; platforms that have defined * it with a different value should define it here with that value - * a link type of 104 in a save file will be mapped to DLT_C_HDLC, * whatever value that happens to be, so programs will correctly * handle files with that link type regardless of the value of * DLT_C_HDLC. * * The name DLT_C_HDLC was used by BSD/OS; we use that name for source * compatibility with programs written for BSD/OS. * * libpcap 0.5 defined it as DLT_CHDLC; we define DLT_CHDLC as well, * for source compatibility with programs written for libpcap 0.5. */ #define DLT_C_HDLC 104 /* Cisco HDLC */ #define DLT_CHDLC DLT_C_HDLC #define DLT_IEEE802_11 105 /* IEEE 802.11 wireless */ /* * Values between 106 and 107 are used in capture file headers as * link-layer types corresponding to DLT_ types that might differ * between platforms; don't use those values for new DLT_ new types. */ /* * Frame Relay; BSD/OS has a DLT_FR with a value of 11, but that collides * with other values. * DLT_FR and DLT_FRELAY packets start with the Q.922 Frame Relay header * (DLCI, etc.). */ #define DLT_FRELAY 107 /* * OpenBSD DLT_LOOP, for loopback devices; it's like DLT_NULL, except * that the AF_ type in the link-layer header is in network byte order. * * OpenBSD defines it as 12, but that collides with DLT_RAW, so we * define it as 108 here. If OpenBSD picks up this file, it should * define DLT_LOOP as 12 in its version, as per the comment above - * and should not use 108 as a DLT_ value. */ #define DLT_LOOP 108 /* * Values between 109 and 112 are used in capture file headers as * link-layer types corresponding to DLT_ types that might differ * between platforms; don't use those values for new DLT_ new types. */ /* * Encapsulated packets for IPsec; DLT_ENC is 13 in OpenBSD, but that's * DLT_SLIP_BSDOS in NetBSD, so we don't use 13 for it in OSes other * than OpenBSD. */ #define DLT_ENC 109 /* * This is for Linux cooked sockets. */ #define DLT_LINUX_SLL 113 /* * Apple LocalTalk hardware. */ #define DLT_LTALK 114 /* * Acorn Econet. */ #define DLT_ECONET 115 /* * Reserved for use with OpenBSD ipfilter. */ #define DLT_IPFILTER 116 /* * Reserved for use in capture-file headers as a link-layer type * corresponding to OpenBSD DLT_PFLOG; DLT_PFLOG is 17 in OpenBSD, * but that's DLT_LANE8023 in SuSE 6.3, so we can't use 17 for it * in capture-file headers. */ #define DLT_PFLOG 117 /* * Registered for Cisco-internal use. */ #define DLT_CISCO_IOS 118 /* * Reserved for 802.11 cards using the Prism II chips, with a link-layer * header including Prism monitor mode information plus an 802.11 * header. */ #define DLT_PRISM_HEADER 119 /* * Reserved for Aironet 802.11 cards, with an Aironet link-layer header * (see Doug Ambrisko's FreeBSD patches). */ #define DLT_AIRONET_HEADER 120 /* * Reserved for use by OpenBSD's pfsync device. */ #define DLT_PFSYNC 121 /* * Reserved for Siemens HiPath HDLC. XXX */ #define DLT_HHDLC 121 /* * Reserved for RFC 2625 IP-over-Fibre Channel. */ #define DLT_IP_OVER_FC 122 /* * Reserved for Full Frontal ATM on Solaris. 
*/ #define DLT_SUNATM 123 /* * Reserved as per request from Kent Dahlgren * for private use. */ #define DLT_RIO 124 /* RapidIO */ #define DLT_PCI_EXP 125 /* PCI Express */ #define DLT_AURORA 126 /* Xilinx Aurora link layer */ /* * BSD header for 802.11 plus a number of bits of link-layer information * including radio information. */ #ifndef DLT_IEEE802_11_RADIO #define DLT_IEEE802_11_RADIO 127 #endif /* * Reserved for TZSP encapsulation. */ #define DLT_TZSP 128 /* Tazmen Sniffer Protocol */ /* * Reserved for Linux ARCNET. */ #define DLT_ARCNET_LINUX 129 /* * Juniper-private data link types. */ #define DLT_JUNIPER_MLPPP 130 #define DLT_JUNIPER_MLFR 131 #define DLT_JUNIPER_ES 132 #define DLT_JUNIPER_GGSN 133 #define DLT_JUNIPER_MFR 134 #define DLT_JUNIPER_ATM2 135 #define DLT_JUNIPER_SERVICES 136 #define DLT_JUNIPER_ATM1 137 /* * Apple IP-over-IEEE 1394, as per a request from Dieter Siegmund * . The header that's presented is an Ethernet-like * header: * * #define FIREWIRE_EUI64_LEN 8 * struct firewire_header { * u_char firewire_dhost[FIREWIRE_EUI64_LEN]; * u_char firewire_shost[FIREWIRE_EUI64_LEN]; * u_short firewire_type; * }; * * with "firewire_type" being an Ethernet type value, rather than, * for example, raw GASP frames being handed up. */ #define DLT_APPLE_IP_OVER_IEEE1394 138 /* * Various SS7 encapsulations, as per a request from Jeff Morriss * and subsequent discussions. */ #define DLT_MTP2_WITH_PHDR 139 /* pseudo-header with various info, followed by MTP2 */ #define DLT_MTP2 140 /* MTP2, without pseudo-header */ #define DLT_MTP3 141 /* MTP3, without pseudo-header or MTP2 */ #define DLT_SCCP 142 /* SCCP, without pseudo-header or MTP2 or MTP3 */ /* * Reserved for DOCSIS. */ #define DLT_DOCSIS 143 /* * Reserved for Linux IrDA. */ #define DLT_LINUX_IRDA 144 /* * Reserved for IBM SP switch and IBM Next Federation switch. */ #define DLT_IBM_SP 145 #define DLT_IBM_SN 146 /* * Reserved for private use. If you have some link-layer header type * that you want to use within your organization, with the capture files * using that link-layer header type not ever be sent outside your * organization, you can use these values. * * No libpcap release will use these for any purpose, nor will any * tcpdump release use them, either. * * Do *NOT* use these in capture files that you expect anybody not using * your private versions of capture-file-reading tools to read; in * particular, do *NOT* use them in products, otherwise you may find that * people won't be able to use tcpdump, or snort, or Ethereal, or... to * read capture files from your firewall/intrusion detection/traffic * monitoring/etc. appliance, or whatever product uses that DLT_ value, * and you may also find that the developers of those applications will * not accept patches to let them read those files. * * Also, do not use them if somebody might send you a capture using them * for *their* private type and tools using them for *your* private type * would have to read them. * * Instead, ask "tcpdump-workers@tcpdump.org" for a new DLT_ value, * as per the comment above, and use the type you're given. 
*/ #define DLT_USER0 147 #define DLT_USER1 148 #define DLT_USER2 149 #define DLT_USER3 150 #define DLT_USER4 151 #define DLT_USER5 152 #define DLT_USER6 153 #define DLT_USER7 154 #define DLT_USER8 155 #define DLT_USER9 156 #define DLT_USER10 157 #define DLT_USER11 158 #define DLT_USER12 159 #define DLT_USER13 160 #define DLT_USER14 161 #define DLT_USER15 162 /* * For future use with 802.11 captures - defined by AbsoluteValue * Systems to store a number of bits of link-layer information * including radio information: * * http://www.shaftnet.org/~pizza/software/capturefrm.txt * * but it might be used by some non-AVS drivers now or in the * future. */ #define DLT_IEEE802_11_RADIO_AVS 163 /* 802.11 plus AVS radio header */ /* * Juniper-private data link type, as per request from * Hannes Gredler . The DLT_s are used * for passing on chassis-internal metainformation such as * QOS profiles, etc.. */ #define DLT_JUNIPER_MONITOR 164 /* * Reserved for BACnet MS/TP. */ #define DLT_BACNET_MS_TP 165 /* * Another PPP variant as per request from Karsten Keil . * * This is used in some OSes to allow a kernel socket filter to distinguish * between incoming and outgoing packets, on a socket intended to * supply pppd with outgoing packets so it can do dial-on-demand and * hangup-on-lack-of-demand; incoming packets are filtered out so they * don't cause pppd to hold the connection up (you don't want random * input packets such as port scans, packets from old lost connections, * etc. to force the connection to stay up). * * The first byte of the PPP header (0xff03) is modified to accomodate * the direction - 0x00 = IN, 0x01 = OUT. */ #define DLT_PPP_PPPD 166 /* * Names for backwards compatibility with older versions of some PPP * software; new software should use DLT_PPP_PPPD. */ #define DLT_PPP_WITH_DIRECTION DLT_PPP_PPPD #define DLT_LINUX_PPP_WITHDIRECTION DLT_PPP_PPPD /* * Juniper-private data link type, as per request from * Hannes Gredler . The DLT_s are used * for passing on chassis-internal metainformation such as * QOS profiles, cookies, etc.. */ #define DLT_JUNIPER_PPPOE 167 #define DLT_JUNIPER_PPPOE_ATM 168 #define DLT_GPRS_LLC 169 /* GPRS LLC */ #define DLT_GPF_T 170 /* GPF-T (ITU-T G.7041/Y.1303) */ #define DLT_GPF_F 171 /* GPF-F (ITU-T G.7041/Y.1303) */ /* * Requested by Oolan Zimmer for use in Gcom's T1/E1 line * monitoring equipment. */ #define DLT_GCOM_T1E1 172 #define DLT_GCOM_SERIAL 173 /* * Juniper-private data link type, as per request from * Hannes Gredler . The DLT_ is used * for internal communication to Physical Interface Cards (PIC) */ #define DLT_JUNIPER_PIC_PEER 174 /* * Link types requested by Gregor Maier of Endace * Measurement Systems. They add an ERF header (see * http://www.endace.com/support/EndaceRecordFormat.pdf) in front of * the link-layer header. */ #define DLT_ERF_ETH 175 /* Ethernet */ #define DLT_ERF_POS 176 /* Packet-over-SONET */ /* * Requested by Daniele Orlandi for raw LAPD * for vISDN (http://www.orlandi.com/visdn/). Its link-layer header * includes additional information before the LAPD header, so it's * not necessarily a generic LAPD header. */ #define DLT_LINUX_LAPD 177 /* * Juniper-private data link type, as per request from * Hannes Gredler . 
* The DLT_ are used for prepending meta-information * like interface index, interface name * before standard Ethernet, PPP, Frelay & C-HDLC Frames */ #define DLT_JUNIPER_ETHER 178 #define DLT_JUNIPER_PPP 179 #define DLT_JUNIPER_FRELAY 180 #define DLT_JUNIPER_CHDLC 181 /* * Multi Link Frame Relay (FRF.16) */ #define DLT_MFR 182 /* * Juniper-private data link type, as per request from * Hannes Gredler . * The DLT_ is used for internal communication with a * voice Adapter Card (PIC) */ #define DLT_JUNIPER_VP 183 /* * Arinc 429 frames. * DLT_ requested by Gianluca Varenni . * Every frame contains a 32bit A429 label. * More documentation on Arinc 429 can be found at * http://www.condoreng.com/support/downloads/tutorials/ARINCTutorial.pdf */ #define DLT_A429 184 /* * Arinc 653 Interpartition Communication messages. * DLT_ requested by Gianluca Varenni . * Please refer to the A653-1 standard for more information. */ #define DLT_A653_ICM 185 /* * USB packets, beginning with a USB setup header; requested by * Paolo Abeni . */ #define DLT_USB 186 /* * Bluetooth HCI UART transport layer (part H:4); requested by * Paolo Abeni. */ #define DLT_BLUETOOTH_HCI_H4 187 /* * IEEE 802.16 MAC Common Part Sublayer; requested by Maria Cruz * . */ #define DLT_IEEE802_16_MAC_CPS 188 /* * USB packets, beginning with a Linux USB header; requested by * Paolo Abeni . */ #define DLT_USB_LINUX 189 /* * Controller Area Network (CAN) v. 2.0B packets. * DLT_ requested by Gianluca Varenni . * Used to dump CAN packets coming from a CAN Vector board. * More documentation on the CAN v2.0B frames can be found at * http://www.can-cia.org/downloads/?269 */ #define DLT_CAN20B 190 /* * IEEE 802.15.4, with address fields padded, as is done by Linux * drivers; requested by Juergen Schimmer. */ #define DLT_IEEE802_15_4_LINUX 191 /* * Per Packet Information encapsulated packets. * DLT_ requested by Gianluca Varenni . */ #define DLT_PPI 192 /* * Header for 802.16 MAC Common Part Sublayer plus a radiotap radio header; * requested by Charles Clancy. */ #define DLT_IEEE802_16_MAC_CPS_RADIO 193 /* * Juniper-private data link type, as per request from * Hannes Gredler . * The DLT_ is used for internal communication with a * integrated service module (ISM). */ #define DLT_JUNIPER_ISM 194 /* * IEEE 802.15.4, exactly as it appears in the spec (no padding, no * nothing); requested by Mikko Saarnivala . */ #define DLT_IEEE802_15_4 195 /* * Various link-layer types, with a pseudo-header, for SITA * (http://www.sita.aero/); requested by Fulko Hew (fulko.hew@gmail.com). */ #define DLT_SITA 196 /* * Various link-layer types, with a pseudo-header, for Endace DAG cards; * encapsulates Endace ERF records. Requested by Stephen Donnelly * . */ #define DLT_ERF 197 /* * Special header prepended to Ethernet packets when capturing from a * u10 Networks board. Requested by Phil Mulholland * . */ #define DLT_RAIF1 198 /* * IPMB packet for IPMI, beginning with the I2C slave address, followed * by the netFn and LUN, etc.. Requested by Chanthy Toeung * . */ #define DLT_IPMB 199 /* * Juniper-private data link type, as per request from * Hannes Gredler . * The DLT_ is used for capturing data on a secure tunnel interface. */ #define DLT_JUNIPER_ST 200 /* * Bluetooth HCI UART transport layer (part H:4), with pseudo-header * that includes direction information; requested by Paolo Abeni. */ #define DLT_BLUETOOTH_HCI_H4_WITH_PHDR 201 /* * AX.25 packet with a 1-byte KISS header; see * * http://www.ax25.net/kiss.htm * * as per Richard Stearn . 
*/ #define DLT_AX25_KISS 202 /* * LAPD packets from an ISDN channel, starting with the address field, * with no pseudo-header. * Requested by Varuna De Silva . */ #define DLT_LAPD 203 /* * Variants of various link-layer headers, with a one-byte direction * pseudo-header prepended - zero means "received by this host", * non-zero (any non-zero value) means "sent by this host" - as per * Will Barker . */ #define DLT_PPP_WITH_DIR 204 /* PPP - don't confuse with DLT_PPP_WITH_DIRECTION */ #define DLT_C_HDLC_WITH_DIR 205 /* Cisco HDLC */ #define DLT_FRELAY_WITH_DIR 206 /* Frame Relay */ #define DLT_LAPB_WITH_DIR 207 /* LAPB */ /* * 208 is reserved for an as-yet-unspecified proprietary link-layer * type, as requested by Will Barker. */ /* * IPMB with a Linux-specific pseudo-header; as requested by Alexey Neyman * . */ #define DLT_IPMB_LINUX 209 /* * FlexRay automotive bus - http://www.flexray.com/ - as requested * by Hannes Kaelber . */ #define DLT_FLEXRAY 210 /* * Media Oriented Systems Transport (MOST) bus for multimedia * transport - http://www.mostcooperation.com/ - as requested * by Hannes Kaelber . */ #define DLT_MOST 211 /* * Local Interconnect Network (LIN) bus for vehicle networks - * http://www.lin-subbus.org/ - as requested by Hannes Kaelber * . */ #define DLT_LIN 212 /* * X2E-private data link type used for serial line capture, * as requested by Hannes Kaelber . */ #define DLT_X2E_SERIAL 213 /* * X2E-private data link type used for the Xoraya data logger * family, as requested by Hannes Kaelber . */ #define DLT_X2E_XORAYA 214 /* * IEEE 802.15.4, exactly as it appears in the spec (no padding, no * nothing), but with the PHY-level data for non-ASK PHYs (4 octets * of 0 as preamble, one octet of SFD, one octet of frame length+ * reserved bit, and then the MAC-layer data, starting with the * frame control field). * * Requested by Max Filippov . */ #define DLT_IEEE802_15_4_NONASK_PHY 215 /* * David Gibson requested this for * captures from the Linux kernel /dev/input/eventN devices. This * is used to communicate keystrokes and mouse movements from the * Linux kernel to display systems, such as Xorg. */ #define DLT_LINUX_EVDEV 216 /* * GSM Um and Abis interfaces, preceded by a "gsmtap" header. * * Requested by Harald Welte . */ #define DLT_GSMTAP_UM 217 #define DLT_GSMTAP_ABIS 218 /* * MPLS, with an MPLS label as the link-layer header. * Requested by Michele Marchetto on behalf * of OpenBSD. */ #define DLT_MPLS 219 /* * USB packets, beginning with a Linux USB header, with the USB header * padded to 64 bytes; required for memory-mapped access. */ #define DLT_USB_LINUX_MMAPPED 220 /* * DECT packets, with a pseudo-header; requested by * Matthias Wenzel . */ #define DLT_DECT 221 /* * From: "Lidwa, Eric (GSFC-582.0)[SGT INC]" * Date: Mon, 11 May 2009 11:18:30 -0500 * * DLT_AOS. We need it for AOS Space Data Link Protocol. * I have already written dissectors for but need an OK from * legal before I can submit a patch. * */ #define DLT_AOS 222 /* * Wireless HART (Highway Addressable Remote Transducer) * From the HART Communication Foundation * IES/PAS 62591 * * Requested by Sam Roberts . */ #define DLT_WIHART 223 /* * Fibre Channel FC-2 frames, beginning with a Frame_Header. * Requested by Kahou Lei . */ #define DLT_FC_2 224 /* * Fibre Channel FC-2 frames, beginning with an encoding of the * SOF, and ending with an encoding of the EOF. 
* * The encodings represent the frame delimiters as 4-byte sequences * representing the corresponding ordered sets, with K28.5 * represented as 0xBC, and the D symbols as the corresponding * byte values; for example, SOFi2, which is K28.5 - D21.5 - D1.2 - D21.2, * is represented as 0xBC 0xB5 0x55 0x55. * * Requested by Kahou Lei . */ #define DLT_FC_2_WITH_FRAME_DELIMS 225 /* * Solaris ipnet pseudo-header; requested by Darren Reed . * * The pseudo-header starts with a one-byte version number; for version 2, * the pseudo-header is: * * struct dl_ipnetinfo { * u_int8_t dli_version; * u_int8_t dli_family; * u_int16_t dli_htype; * u_int32_t dli_pktlen; * u_int32_t dli_ifindex; * u_int32_t dli_grifindex; * u_int32_t dli_zsrc; * u_int32_t dli_zdst; * }; * * dli_version is 2 for the current version of the pseudo-header. * * dli_family is a Solaris address family value, so it's 2 for IPv4 * and 26 for IPv6. * * dli_htype is a "hook type" - 0 for incoming packets, 1 for outgoing * packets, and 2 for packets arriving from another zone on the same * machine. * * dli_pktlen is the length of the packet data following the pseudo-header * (so the captured length minus dli_pktlen is the length of the * pseudo-header, assuming the entire pseudo-header was captured). * * dli_ifindex is the interface index of the interface on which the * packet arrived. * * dli_grifindex is the group interface index number (for IPMP interfaces). * * dli_zsrc is the zone identifier for the source of the packet. * * dli_zdst is the zone identifier for the destination of the packet. * * A zone number of 0 is the global zone; a zone number of 0xffffffff * means that the packet arrived from another host on the network, not * from another zone on the same machine. * * An IPv4 or IPv6 datagram follows the pseudo-header; dli_family indicates * which of those it is. */ #define DLT_IPNET 226 /* * CAN (Controller Area Network) frames, with a pseudo-header as supplied * by Linux SocketCAN. See Documentation/networking/can.txt in the Linux * source. * * Requested by Felix Obenhuber . */ #define DLT_CAN_SOCKETCAN 227 /* * Raw IPv4/IPv6; different from DLT_RAW in that the DLT_ value specifies * whether it's v4 or v6. Requested by Darren Reed . */ #define DLT_IPV4 228 #define DLT_IPV6 229 /* * IEEE 802.15.4, exactly as it appears in the spec (no padding, no * nothing), and with no FCS at the end of the frame; requested by * Jon Smirl . */ #define DLT_IEEE802_15_4_NOFCS 230 /* * Raw D-Bus: * * http://www.freedesktop.org/wiki/Software/dbus * * messages: * * http://dbus.freedesktop.org/doc/dbus-specification.html#message-protocol-messages * * starting with the endianness flag, followed by the message type, etc., * but without the authentication handshake before the message sequence: * * http://dbus.freedesktop.org/doc/dbus-specification.html#auth-protocol * * Requested by Martin Vidner . */ #define DLT_DBUS 231 /* * Juniper-private data link type, as per request from * Hannes Gredler . */ #define DLT_JUNIPER_VS 232 #define DLT_JUNIPER_SRX_E2E 233 #define DLT_JUNIPER_FIBRECHANNEL 234 /* * DVB-CI (DVB Common Interface for communication between a PC Card * module and a DVB receiver). See * * http://www.kaiser.cx/pcap-dvbci.html * * for the specification. * * Requested by Martin Kaiser . */ #define DLT_DVB_CI 235 /* * Variant of 3GPP TS 27.010 multiplexing protocol (similar to, but * *not* the same as, 27.010). Requested by Hans-Christoph Schemmel * . */ #define DLT_MUX27010 236 /* * STANAG 5066 D_PDUs. Requested by M. Baris Demiray * . 
*/ #define DLT_STANAG_5066_D_PDU 237 /* * Juniper-private data link type, as per request from * Hannes Gredler . */ #define DLT_JUNIPER_ATM_CEMIC 238 /* * NetFilter LOG messages * (payload of netlink NFNL_SUBSYS_ULOG/NFULNL_MSG_PACKET packets) * * Requested by Jakub Zawadzki */ #define DLT_NFLOG 239 /* * Hilscher Gesellschaft fuer Systemautomation mbH link-layer type * for Ethernet packets with a 4-byte pseudo-header and always * with the payload including the FCS, as supplied by their * netANALYZER hardware and software. * * Requested by Holger P. Frommer */ #define DLT_NETANALYZER 240 /* * Hilscher Gesellschaft fuer Systemautomation mbH link-layer type * for Ethernet packets with a 4-byte pseudo-header and FCS and * with the Ethernet header preceded by 7 bytes of preamble and * 1 byte of SFD, as supplied by their netANALYZER hardware and * software. * * Requested by Holger P. Frommer */ #define DLT_NETANALYZER_TRANSPARENT 241 /* * IP-over-InfiniBand, as specified by RFC 4391. * * Requested by Petr Sumbera . */ #define DLT_IPOIB 242 /* * MPEG-2 transport stream (ISO 13818-1/ITU-T H.222.0). * * Requested by Guy Martin . */ #define DLT_MPEG_2_TS 243 /* * ng4T GmbH's UMTS Iub/Iur-over-ATM and Iub/Iur-over-IP format as * used by their ng40 protocol tester. * * Requested by Jens Grimmer . */ #define DLT_NG40 244 /* * Pseudo-header giving adapter number and flags, followed by an NFC * (Near-Field Communications) Logical Link Control Protocol (LLCP) PDU, * as specified by NFC Forum Logical Link Control Protocol Technical * Specification LLCP 1.1. * * Requested by Mike Wakerly . */ #define DLT_NFC_LLCP 245 /* * 245 is used as LINKTYPE_PFSYNC; do not use it for any other purpose. * * DLT_PFSYNC has different values on different platforms, and all of * them collide with something used elsewhere. On platforms that * don't already define it, define it as 245. */ #if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__DragonFly__) && !defined(__APPLE__) #define DLT_PFSYNC 246 #endif /* * Raw InfiniBand packets, starting with the Local Routing Header. * * Requested by Oren Kladnitsky . */ #define DLT_INFINIBAND 247 /* * SCTP, with no lower-level protocols (i.e., no IPv4 or IPv6). * * Requested by Michael Tuexen . */ #define DLT_SCTP 248 /* * USB packets, beginning with a USBPcap header. * * Requested by Tomasz Mon */ #define DLT_USBPCAP 249 /* * Schweitzer Engineering Laboratories "RTAC" product serial-line * packets. * * Requested by Chris Bontje . */ #define DLT_RTAC_SERIAL 250 /* * Bluetooth Low Energy air interface link-layer packets. * * Requested by Mike Kershaw . */ #define DLT_BLUETOOTH_LE_LL 251 /* * DLT type for upper-protocol layer PDU saves from wireshark. * * the actual contents are determined by two TAGs stored with each * packet: * EXP_PDU_TAG_LINKTYPE the link type (LINKTYPE_ value) of the * original packet. * * EXP_PDU_TAG_PROTO_NAME the name of the wireshark dissector * that can make sense of the data stored. */ #define DLT_WIRESHARK_UPPER_PDU 252 /* * DLT type for the netlink protocol (nlmon devices). */ #define DLT_NETLINK 253 /* * Bluetooth Linux Monitor headers for the BlueZ stack. */ #define DLT_BLUETOOTH_LINUX_MONITOR 254 /* * Bluetooth Basic Rate/Enhanced Data Rate baseband packets, as * captured by Ubertooth. */ #define DLT_BLUETOOTH_BREDR_BB 255 /* * Bluetooth Low Energy link layer packets, as captured by Ubertooth. */ #define DLT_BLUETOOTH_LE_LL_WITH_PHDR 256 /* * PROFIBUS data link layer. 
*/ #define DLT_PROFIBUS_DL 257 /* * Apple's DLT_PKTAP headers. * * Sadly, the folks at Apple either had no clue that the DLT_USERn values * are for internal use within an organization and partners only, and * didn't know that the right way to get a link-layer header type is to * ask tcpdump.org for one, or knew and didn't care, so they just * used DLT_USER2, which causes problems for everything except for * their version of tcpdump. * * So I'll just give them one; hopefully this will show up in a * libpcap release in time for them to get this into 10.10 Big Sur * or whatever Mavericks' successor is called. LINKTYPE_PKTAP * will be 258 *even on OS X*; that is *intentional*, so that * PKTAP files look the same on *all* OSes (different OSes can have * different numerical values for a given DLT_, but *MUST NOT* have * different values for what goes in a file, as files can be moved * between OSes!). * * When capturing, on a system with a Darwin-based OS, on a device * that returns 149 (DLT_USER2 and Apple's DLT_PKTAP) with this * version of libpcap, the DLT_ value for the pcap_t will be DLT_PKTAP, * and that will continue to be DLT_USER2 on Darwin-based OSes. That way, * binary compatibility with Mavericks is preserved for programs using * this version of libpcap. This does mean that if you were using * DLT_USER2 for some capture device on OS X, you can't do so with * this version of libpcap, just as you can't with Apple's libpcap - * on OS X, they define DLT_PKTAP to be DLT_USER2, so programs won't * be able to distinguish between PKTAP and whatever you were using * DLT_USER2 for. * * If the program saves the capture to a file using this version of * libpcap's pcap_dump code, the LINKTYPE_ value in the file will be * LINKTYPE_PKTAP, which will be 258, even on Darwin-based OSes. * That way, the file will *not* be a DLT_USER2 file. That means * that the latest version of tcpdump, when built with this version * of libpcap, and sufficiently recent versions of Wireshark will * be able to read those files and interpret them correctly; however, * Apple's version of tcpdump in OS X 10.9 won't be able to handle * them. (Hopefully, Apple will pick up this version of libpcap, * and the corresponding version of tcpdump, so that tcpdump will * be able to handle the old LINKTYPE_USER2 captures *and* the new * LINKTYPE_PKTAP captures.) */ #ifdef __APPLE__ #define DLT_PKTAP DLT_USER2 #else #define DLT_PKTAP 258 #endif /* * Ethernet packets preceded by a header giving the last 6 octets * of the preamble specified by 802.3-2012 Clause 65, section * 65.1.3.2 "Transmit". */ #define DLT_EPON 259 /* * IPMI trace packets, as specified by Table 3-20 "Trace Data Block Format" * in the PICMG HPM.2 specification. */ #define DLT_IPMI_HPM_2 260 #define DLT_MATCHING_MAX 260 /* highest value in the "matching" range */ /* * DLT and savefile link type values are split into a class and * a member of that class. A class value of 0 indicates a regular * DLT_/LINKTYPE_ value. */ #define DLT_CLASS(x) ((x) & 0x03ff0000) /* * The instruction encodings. * * Please inform tcpdump-workers@lists.tcpdump.org if you use any * of the reserved values, so that we can note that they're used * (and perhaps implement it in the reference BPF implementation * and encourage its implementation elsewhere). */ /* * The upper 8 bits of the opcode aren't used. BSD/OS used 0x8000. 
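 *
 * Illustrative example (not part of the original header): a minimal filter
 * built with the BPF_STMT() and BPF_JUMP() macros defined below.  It loads
 * the Ethernet type field at offset 12, accepts IPv4 frames (type 0x0800)
 * with an unlimited snap length, and rejects everything else.  It would be
 * loaded through BIOCSETF via a struct bpf_program whose bf_len is 4 and
 * whose bf_insns points at the array.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x0800, 0, 1),
 *		BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
 *		BPF_STMT(BPF_RET + BPF_K, 0),
 *	};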
*/ /* instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 #define BPF_ST 0x02 #define BPF_STX 0x03 #define BPF_ALU 0x04 #define BPF_JMP 0x05 #define BPF_RET 0x06 #define BPF_MISC 0x07 /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) #define BPF_W 0x00 #define BPF_H 0x08 #define BPF_B 0x10 /* 0x18 reserved; used by BSD/OS */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 #define BPF_IND 0x40 #define BPF_MEM 0x60 #define BPF_LEN 0x80 #define BPF_MSH 0xa0 /* 0xc0 reserved; used by BSD/OS */ /* 0xe0 reserved; used by BSD/OS */ /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) #define BPF_ADD 0x00 #define BPF_SUB 0x10 #define BPF_MUL 0x20 #define BPF_DIV 0x30 #define BPF_OR 0x40 #define BPF_AND 0x50 #define BPF_LSH 0x60 #define BPF_RSH 0x70 #define BPF_NEG 0x80 #define BPF_MOD 0x90 #define BPF_XOR 0xa0 /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_JA 0x00 #define BPF_JEQ 0x10 #define BPF_JGT 0x20 #define BPF_JGE 0x30 #define BPF_JSET 0x40 /* 0x50 reserved; used on BSD/OS */ /* 0x60 reserved */ /* 0x70 reserved */ /* 0x80 reserved */ /* 0x90 reserved */ /* 0xa0 reserved */ /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_SRC(code) ((code) & 0x08) #define BPF_K 0x00 #define BPF_X 0x08 /* ret - BPF_K and BPF_X also apply */ #define BPF_RVAL(code) ((code) & 0x18) #define BPF_A 0x10 /* 0x18 reserved */ /* misc */ #define BPF_MISCOP(code) ((code) & 0xf8) #define BPF_TAX 0x00 /* 0x08 reserved */ /* 0x10 reserved */ /* 0x18 reserved */ /* #define BPF_COP 0x20 NetBSD "coprocessor" extensions */ /* 0x28 reserved */ /* 0x30 reserved */ /* 0x38 reserved */ /* #define BPF_COPX 0x40 NetBSD "coprocessor" extensions */ /* also used on BSD/OS */ /* 0x48 reserved */ /* 0x50 reserved */ /* 0x58 reserved */ /* 0x60 reserved */ /* 0x68 reserved */ /* 0x70 reserved */ /* 0x78 reserved */ #define BPF_TXA 0x80 /* 0x88 reserved */ /* 0x90 reserved */ /* 0x98 reserved */ /* 0xa0 reserved */ /* 0xa8 reserved */ /* 0xb0 reserved */ /* 0xb8 reserved */ /* 0xc0 reserved; used on BSD/OS */ /* 0xc8 reserved */ /* 0xd0 reserved */ /* 0xd8 reserved */ /* 0xe0 reserved */ /* 0xe8 reserved */ /* 0xf0 reserved */ /* 0xf8 reserved */ /* * The instruction data structure. */ struct bpf_insn { u_short code; u_char jt; u_char jf; bpf_u_int32 k; }; /* * Macros for insn array initializers. */ #define BPF_STMT(code, k) { (u_short)(code), 0, 0, k } #define BPF_JUMP(code, k, jt, jf) { (u_short)(code), jt, jf, k } /* * Structure to retrieve available DLTs for the interface. */ struct bpf_dltlist { u_int bfl_len; /* number of bfd_list array */ u_int *bfl_list; /* array of DLTs */ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_BPF); #endif #ifdef SYSCTL_DECL SYSCTL_DECL(_net_bpf); #endif /* * Rotate the packet buffers in descriptor d. Move the store buffer into the * hold slot, and the free buffer into the store slot. Zero the length of the * new store buffer. Descriptor lock should be held. One must be careful to * not rotate the buffers twice, i.e. if fbuf != NULL. */ #define ROTATE_BUFFERS(d) do { \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; \ bpf_bufheld(d); \ } while (0) /* * Descriptor associated with each attached hardware interface. 
* Part of this structure is exposed to external callers to speed up * bpf_peers_present() calls. */ struct bpf_if; struct bpf_if_ext { LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ }; void bpf_bufheld(struct bpf_d *d); int bpf_validate(const struct bpf_insn *, int); void bpf_tap(struct bpf_if *, u_char *, u_int); void bpf_mtap(struct bpf_if *, struct mbuf *); void bpf_mtap2(struct bpf_if *, void *, u_int, struct mbuf *); void bpfattach(struct ifnet *, u_int, u_int); void bpfattach2(struct ifnet *, u_int, u_int, struct bpf_if **); void bpfdetach(struct ifnet *); +#ifdef VIMAGE +int bpf_get_bp_params(struct bpf_if *, u_int *, u_int *); +#endif void bpfilterattach(int); u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); static __inline int bpf_peers_present(struct bpf_if *bpf) { struct bpf_if_ext *ext; ext = (struct bpf_if_ext *)bpf; if (!LIST_EMPTY(&ext->bif_dlist)) return (1); return (0); } #define BPF_TAP(_ifp,_pkt,_pktlen) do { \ if (bpf_peers_present((_ifp)->if_bpf)) \ bpf_tap((_ifp)->if_bpf, (_pkt), (_pktlen)); \ } while (0) #define BPF_MTAP(_ifp,_m) do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ bpf_mtap((_ifp)->if_bpf, (_m)); \ } \ } while (0) #define BPF_MTAP2(_ifp,_data,_dlen,_m) do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ bpf_mtap2((_ifp)->if_bpf,(_data),(_dlen),(_m)); \ } \ } while (0) #endif /* * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST). */ #define BPF_MEMWORDS 16 #ifdef _SYS_EVENTHANDLER_H_ /* BPF attach/detach events */ struct ifnet; typedef void (*bpf_track_fn)(void *, struct ifnet *, int /* dlt */, int /* 1 =>'s attach */); EVENTHANDLER_DECLARE(bpf_track, bpf_track_fn); #endif /* _SYS_EVENTHANDLER_H_ */ #endif /* _NET_BPF_H_ */ Index: projects/vnet/sys/net/if.c =================================================================== --- projects/vnet/sys/net/if.c (revision 297511) +++ projects/vnet/sys/net/if.c (revision 297512) @@ -1,4024 +1,4047 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. 
*/ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); -static int if_detach_internal(struct ifnet *, int, struct if_clone **); +static int if_detach_internal(struct ifnet *, int, int, struct if_clone **); #ifdef VIMAGE -static void if_vmove(struct ifnet *, struct vnet *); +static void if_vmove(struct ifnet *, struct vnet *, int); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. 
*/ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); -/* - * XXX-BZ VNET; probably along with dom stuff. - * This is very wrong but MC currently implies that interfaces are - * gone before we can free it. This needs to be fied differently - * and this needs to be moved back to SI_SUB_INIT_IF. - */ static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. 
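 *
 * A sketch (illustrative only; the real work happens inside if_vmove() and
 * if_detach_internal()) of how the bpf_get_bp_params() interface added in
 * net/bpf.c is meant to be used around such a move: save the DLT and
 * header length before the interface leaves its current vnet, then
 * re-create the BPF attachment once it has arrived in the target vnet.
 *
 *	u_int dlt = 0, hdrlen = 0;
 *
 *	if (bpf_get_bp_params(ifp->if_bpf, &dlt, &hdrlen) == 0) {
 *		... detach, move the ifnet to the target vnet ...
 *		bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 *	}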
*/ TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) - if_vmove(ifp, ifp->if_home_vnet); + if_vmove(ifp, ifp->if_home_vnet, 1); } } -VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_if_return, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); idx = ifindex_alloc(); ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. 
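Taken together, if_alloc(), if_attach(), if_detach() and if_free() form the driver-visible lifecycle; a compressed sketch of the usual pairing (driver specifics omitted; IFT_ETHER comes from if_types.h and the foo_* names are purely illustrative, not from this file):

	/* foo_attach() */
	ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL)
		return (ENOSPC);	/* only possible if the l2com allocation failed */
	/* ... set if_softc, if_xname, if_init, if_ioctl, if_transmit ... */
	if_attach(ifp);

	/* foo_detach() */
	if_detach(ifp);
	if_free(ifp);	/* drops the index reference; freed on last if_rele() */
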
*/ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initalization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. 
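The two TSO helpers above are meant to be chained: a driver that aggregates several ports zeroes a struct ifnet_hw_tsomax, folds every member's limits in with if_hw_tsomax_common(), and then pushes the result to its own ifnet with if_hw_tsomax_update(). A sketch under those assumptions, with the member iteration purely illustrative:

	struct ifnet_hw_tsomax tsomax;

	memset(&tsomax, 0, sizeof(tsomax));	/* zero means "no limit seen yet" */

	/* fold in the least common limit of every member port (loop is hypothetical) */
	FOREACH_MEMBER(sc, port)
		if_hw_tsomax_common(port->ifp, &tsomax);

	/* apply to the aggregate interface; returns non-zero if anything changed */
	if (if_hw_tsomax_update(sc->sc_ifp, &tsomax) != 0)
		if_printf(sc->sc_ifp, "TSO limits changed\n");
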
*/ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. 
*/ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; /* XXX IF_ADDR_R/WLOCK */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); - if_detach_internal(ifp, 0, NULL); + if_detach_internal(ifp, 0, 0, NULL); CURVNET_RESTORE(); } +/* + * The vmove, if set, flag indicates that we are called from a callpath + * that is moving an interface to a different vnet instance. + * + * The shutdown flag, if set, indicates that we are called in the + * progress of shutting down a vnet instance. Currently only the + * vnet_if_return SYSUNINIT function sets it. Note: we can be called + * on a vnet instance shutdown without this flag being set, e.g., when + * the cloned interfaces are destoyed as first thing of teardown. + */ static int -if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) +if_detach_internal(struct ifnet *ifp, int vmove, int shutdown, + struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. 
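With the extra argument the behaviour of if_detach_internal() is driven entirely by which call path is taken; the three combinations that exist after this change can be summarized as a sketch (values copied from the call sites in this file):

	/* normal driver detach: not moving, not tearing the vnet down */
	if_detach_internal(ifp, 0 /* vmove */, 0 /* shutdown */, NULL);

	/* cross-vnet move via SIOCSIFVNET/SIOCSIFRVNET */
	if_vmove(ifp, new_vnet, 0);		/* -> if_detach_internal(ifp, 1, 0, &ifc) */

	/* vnet teardown returning interfaces to their home vnet */
	if_vmove(ifp, ifp->if_home_vnet, 1);	/* -> if_detach_internal(ifp, 1, 1, &ifc) */

In the shutdown case the multicast, route and INET/INET6 cleanup below is skipped or reduced, since the dying vnet's protocol state is being torn down wholesale anyway.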
*/ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } + /* The one thing we have to do. */ + if_delgroups(ifp); + + if (!vmove && !shutdown && ifp->if_vnet->vnet_state == VNET_STATE_DYING_AFTER_PSEUDO) + return (ENOENT); + /* Check if this is a cloned interface or not. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET - in_ifdetach(ifp); + in_ifdetach(ifp, !shutdown); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ - in6_ifdetach(ifp); + in6_ifdetach(ifp, !shutdown); #endif - if_purgemaddrs(ifp); + if (!shutdown) + if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } - rt_flushifroutes(ifp); - if_delgroups(ifp); + if (!shutdown) + rt_flushifroutes(ifp); /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void -if_vmove(struct ifnet *ifp, struct vnet *new_vnet) +if_vmove(struct ifnet *ifp, struct vnet *new_vnet, int shutdown) { struct if_clone *ifc; int rc; + u_int bif_dlt, bif_hdrlen; /* + * if_detach_internal() will call the eventhandler to notify + * interface departure. That will detach if_bpf. We need to + * safe the dlt and hdrlen so we can re-attach it later. + */ + bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); + + /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. 
*/ - rc = if_detach_internal(ifp, 1, &ifc); + rc = if_detach_internal(ifp, 1, shutdown, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); ifp->if_index = ifindex_alloc(); ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); + if (ifp->if_bpf == NULL) + bpfattach(ifp, bif_dlt, bif_hdrlen); + CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ - if_vmove(ifp, pr->pr_vnet); + if_vmove(ifp, pr->pr_vnet, 0); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ - if_vmove(ifp, vnet_dst); + if_vmove(ifp, vnet_dst, 0); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. 
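if_vmove_loan() and if_vmove_reclaim() are the kernel halves of the SIOCSIFVNET and SIOCSIFRVNET ioctls handled further down; from userland the whole operation is a single ioctl on a datagram socket. A hedged sketch (the wrapper name is made up; the ifr_jid spelling is assumed to match the kernel side above):

	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/sockio.h>
	#include <net/if.h>
	#include <string.h>

	/* Loan "ifname" to the vnet jail 'jid', or reclaim it from there. */
	static int
	move_if_vnet(int s, const char *ifname, int jid, int reclaim)
	{
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
		ifr.ifr_jid = jid;
		return (ioctl(s, reclaim ? SIOCSIFRVNET : SIOCSIFVNET, &ifr));
	}

On success the kernel writes the interface's (possibly changed) if_xname back into ifr_name, exactly as the sprintf() calls above do.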
*/ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); // <<<< IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from 
an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. 
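if_getgroup() implements the usual two-pass sizing protocol: a request with ifgr_len == 0 only reports how much space the group list needs, and a second request with a buffer attached gets the struct ifg_req records copied out. A userland sketch of that protocol ('s' is any datagram socket; error handling trimmed):

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/sockio.h>
	#include <net/if.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static void
	print_if_groups(int s, const char *ifname)
	{
		struct ifgroupreq ifgr;
		struct ifg_req *ifg;
		u_int i;

		memset(&ifgr, 0, sizeof(ifgr));
		strlcpy(ifgr.ifgr_name, ifname, sizeof(ifgr.ifgr_name));
		if (ioctl(s, SIOCGIFGROUP, &ifgr) == -1)	/* pass 1: size only */
			return;
		if ((ifgr.ifgr_groups = calloc(1, ifgr.ifgr_len)) == NULL)
			return;
		if (ioctl(s, SIOCGIFGROUP, &ifgr) == 0)		/* pass 2: copy out */
			for (i = 0, ifg = ifgr.ifgr_groups;
			    i < ifgr.ifgr_len / sizeof(*ifg); i++, ifg++)
				printf("%s\n", ifg->ifgrq_group);
		free(ifgr.ifgr_groups);
	}

The same pattern applies to if_getgroupmembers() via SIOCGIFGMEMB, reading ifgrq_member instead of ifgrq_group.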
*/ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. 
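if_data_copy() is the read side of the counter(9) scheme: it pulls each value through if_get_counter, which defaults to if_get_counter_default() above but may be overridden by a driver. The write side is if_inc_counter(); a typical driver receive path updates the same counters roughly like this:

	/* in a hypothetical driver RX handler */
	if (m == NULL) {
		if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
		return;
	}
	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
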
*/ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", __func__, otype, if_name(ifp), error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. 
*/ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. 
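All of the ifa_ifwith*() lookups above hand back the ifaddr with an extra reference (only ifa_ifwithaddr_check() does not), so every successful lookup must eventually be balanced by ifa_free(); the usage pattern is simply:

	struct ifaddr *ifa;

	ifa = ifa_ifwithaddr(addr);	/* reference taken on success */
	if (ifa != NULL) {
		/* ... ifa->ifa_ifp and ifa->ifa_addr are stable here ... */
		ifa_free(ifa);		/* drop the lookup reference */
	}
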
*/ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. 
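The byte-at-a-time test "(*cp ^ *cp2) & *cp3" used in both loops above is just a generic masked-prefix comparison over sa_data; stripped of the list walking it amounts to the following self-contained sketch:

	/*
	 * Return 1 if 'addr' and 'ifaddr' agree on every bit covered by
	 * 'mask'; the mask's sa_len bounds the comparison, as above.
	 */
	static int
	sa_prefix_match_sketch(const struct sockaddr *addr,
	    const struct sockaddr *ifaddr, const struct sockaddr *mask)
	{
		const char *cp = addr->sa_data;
		const char *cp2 = ifaddr->sa_data;
		const char *cp3 = mask->sa_data;
		const char *cplim = mask->sa_len + (const char *)mask;

		while (cp3 < cplim)
			if ((*cp++ ^ *cp2++) & *cp3++)
				return (0);	/* a masked bit differs */
		return (1);
	}
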
*/ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. 
*/ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? 
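Note how SIOCGIFFLAGS splits the 32-bit flag word across two 16-bit ifreq members; userland has to reassemble it the same way the kernel packed it. A minimal sketch:

	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/sockio.h>
	#include <net/if.h>
	#include <string.h>

	/* Fetch the full flag word for "ifname" through datagram socket 's'. */
	static int
	get_if_flags(int s, const char *ifname, int *flagsp)
	{
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
		if (ioctl(s, SIOCGIFFLAGS, &ifr) == -1)
			return (-1);
		/* low 16 bits in ifr_flags, high 16 bits in ifr_flagshigh */
		*flagsp = (ifr.ifr_flags & 0xffff) | (ifr.ifr_flagshigh << 16);
		return (0);
	}
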
*/ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { if_up(ifp); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. 
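SIOCGIFDESCR uses a sizing handshake of its own: if the supplied buffer is too small the kernel NULLs ifr_buffer.buffer and reports the needed length in ifr_buffer.length instead of failing, so userland is expected to loop. A sketch of that loop, assuming the same includes as the userland sketches above (reallocf() is the FreeBSD libc helper; error handling trimmed):

	char *descr = NULL;
	size_t len = 64;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	for (;;) {
		if ((descr = reallocf(descr, len)) == NULL)
			break;				/* out of memory */
		ifr.ifr_buffer.buffer = descr;
		ifr.ifr_buffer.length = len;
		if (ioctl(s, SIOCGIFDESCR, &ifr) == -1)
			break;				/* ENOMSG: no description set */
		if (ifr.ifr_buffer.buffer == descr) {
			printf("description: %s\n", descr);
			break;
		}
		len = ifr.ifr_buffer.length;		/* kernel told us the real size */
	}
	free(descr);
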
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
*/ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. 
*/ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. 
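Because if_setflag() only talks to the driver on the 0->1 and 1->0 transitions of the refcount, stacked consumers can enable and disable promiscuous mode independently; e.g.:

	ifpromisc(ifp, 1);	/* if_pcount 0 -> 1: driver asked to set IFF_PROMISC */
	ifpromisc(ifp, 1);	/* 1 -> 2: refcount only, no driver call */
	ifpromisc(ifp, 0);	/* 2 -> 1: refcount only */
	ifpromisc(ifp, 0);	/* 1 -> 0: driver asked to clear IFF_PROMISC */

if_allmulti() below behaves the same way against if_amcount, just without a permanent-mode flag.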
*/ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. 
* Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. 
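/*
 * Illustrative sketch, not part of this file: the usual pairing for a
 * network-layer consumer of if_addmulti()/if_delmulti_ifma().  The group
 * sockaddr and the "foo" names are hypothetical; the important part is that
 * the ifmultiaddr pointer returned by if_addmulti() is kept and handed back
 * to if_delmulti_ifma() when the membership is dropped.
 */
static struct ifmultiaddr *foo_ifma;

static int
foo_join_group(struct ifnet *ifp, struct sockaddr *group)
{

	/* Bumps the refcount if the group is already joined on this ifp. */
	return (if_addmulti(ifp, group, &foo_ifma));
}

static void
foo_leave_group(void)
{

	if (foo_ifma != NULL) {
		/* Last reference released: hardware filter gets reprogrammed. */
		if_delmulti_ifma(foo_ifma);
		foo_ifma = NULL;
	}
}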
* Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. 
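/*
 * Illustrative sketch, not part of this file: a hypothetical caller of
 * if_setlladdr().  The length passed must match the current sdl_alen, e.g.
 * ETHER_ADDR_LEN for an IFT_ETHER interface; if the interface is up, the
 * routine bounces it (down/up via SIOCSIFFLAGS) so the driver reprograms
 * its address filter.
 */
static int
foo_set_mac(struct ifnet *ifp, const u_char new_mac[ETHER_ADDR_LEN])
{

	return (if_setlladdr(ifp, new_mac, ETHER_ADDR_LEN));
}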
* * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) 
{ va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. 
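/*
 * Illustrative sketch, not part of this file: how a driver written against
 * the if_t accessors above might publish its capabilities during attach.
 * The capability mix and the 1 Gb/s baudrate are hypothetical; if_t is an
 * alias for struct ifnet * here, so the accessors merely hide the structure
 * layout from the driver.
 */
static void
foo_publish_caps(if_t ifp)
{

	if_setcapabilities(ifp, IFCAP_HWCSUM | IFCAP_VLAN_MTU);
	if_setcapenable(ifp, if_getcapabilities(ifp));
	if_setbaudrate(ifp, IF_Gbps(1));
}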
*/ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet 
*)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: projects/vnet/sys/net/if_bridge.c =================================================================== --- projects/vnet/sys/net/if_bridge.c (revision 297511) +++ projects/vnet/sys/net/if_bridge.c (revision 297512) @@ -1,3572 +1,3572 @@ /* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ /* * Copyright 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Jason R. Thorpe for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp */ /* * Network interface bridge support. * * TODO: * * - Currently only supports Ethernet-like interfaces (Ethernet, * 802.11, VLANs on Ethernet, etc.) Figure out a nice way * to bridge other types of interfaces (FDDI-FDDI, and maybe * consider heterogenous bridges). */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include /* for net/if.h */ #include #include /* string functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #if defined(INET) || defined(INET6) #include #endif #include #include #include #include #include #include #include /* * Size of the route hash table. Must be a power of two. */ #ifndef BRIDGE_RTHASH_SIZE #define BRIDGE_RTHASH_SIZE 1024 #endif #define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) /* * Default maximum number of addresses to cache. */ #ifndef BRIDGE_RTABLE_MAX #define BRIDGE_RTABLE_MAX 2000 #endif /* * Timeout (in seconds) for entries learned dynamically. */ #ifndef BRIDGE_RTABLE_TIMEOUT #define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ #endif /* * Number of seconds between walks of the route list. */ #ifndef BRIDGE_RTABLE_PRUNE_PERIOD #define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) #endif /* * List of capabilities to possibly mask on the member interface. 
*/ #define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) /* * List of capabilities to strip */ #define BRIDGE_IFCAPS_STRIP IFCAP_LRO /* * Bridge interface list entry. */ struct bridge_iflist { LIST_ENTRY(bridge_iflist) bif_next; struct ifnet *bif_ifp; /* member if */ struct bstp_port bif_stp; /* STP state */ uint32_t bif_flags; /* member if flags */ int bif_savedcaps; /* saved capabilities */ uint32_t bif_addrmax; /* max # of addresses */ uint32_t bif_addrcnt; /* cur. # of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ }; /* * Bridge route node. */ struct bridge_rtnode { LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ struct bridge_iflist *brt_dst; /* destination if */ unsigned long brt_expire; /* expiration time */ uint8_t brt_flags; /* address flags */ uint8_t brt_addr[ETHER_ADDR_LEN]; uint16_t brt_vlan; /* vlan id */ }; #define brt_ifp brt_dst->bif_ifp /* * Software state for each bridge. */ struct bridge_softc { struct ifnet *sc_ifp; /* make this an interface */ LIST_ENTRY(bridge_softc) sc_list; struct mtx sc_mtx; struct cv sc_cv; uint32_t sc_brtmax; /* max # of addresses */ uint32_t sc_brtcnt; /* cur. # of addresses */ uint32_t sc_brttimeout; /* rt timeout in seconds */ struct callout sc_brcallout; /* bridge callout */ uint32_t sc_iflist_ref; /* refcount for sc_iflist */ uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ uint32_t sc_rthash_key; /* key for hash */ LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ u_char sc_defaddr[6]; /* Default MAC address */ }; static VNET_DEFINE(struct mtx, bridge_list_mtx); #define V_bridge_list_mtx VNET(bridge_list_mtx) static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; uma_zone_t bridge_rtnode_zone; static int bridge_clone_create(struct if_clone *, int, caddr_t); static void bridge_clone_destroy(struct ifnet *); static int bridge_ioctl(struct ifnet *, u_long, caddr_t); static void bridge_mutecaps(struct bridge_softc *); static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); static int bridge_transmit(struct ifnet *, struct mbuf *); static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, struct mbuf *m); static void bridge_timer(void *); static void bridge_broadcast(struct bridge_softc *, struct ifnet *, struct mbuf *, int); static void bridge_span(struct bridge_softc *, struct mbuf *); static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, uint16_t, struct bridge_iflist *, int, uint8_t); static struct ifnet *bridge_rtlookup(struct 
bridge_softc *, const uint8_t *, uint16_t); static void bridge_rttrim(struct bridge_softc *); static void bridge_rtage(struct bridge_softc *); static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *, const uint8_t *, uint16_t); static int bridge_rtnode_insert(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtnode_destroy(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtable_expire(struct ifnet *, int); static void bridge_state_change(struct ifnet *, int); static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, const char *name); static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, struct ifnet *ifp); static void bridge_delete_member(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_delete_span(struct bridge_softc *, struct bridge_iflist *); static int bridge_ioctl_add(struct bridge_softc *, void *); static int bridge_ioctl_del(struct bridge_softc *, void *); static int bridge_ioctl_gifflags(struct bridge_softc *, void *); static int bridge_ioctl_sifflags(struct bridge_softc *, void *); static int bridge_ioctl_scache(struct bridge_softc *, void *); static int bridge_ioctl_gcache(struct bridge_softc *, void *); static int bridge_ioctl_gifs(struct bridge_softc *, void *); static int bridge_ioctl_rts(struct bridge_softc *, void *); static int bridge_ioctl_saddr(struct bridge_softc *, void *); static int bridge_ioctl_sto(struct bridge_softc *, void *); static int bridge_ioctl_gto(struct bridge_softc *, void *); static int bridge_ioctl_daddr(struct bridge_softc *, void *); static int bridge_ioctl_flush(struct bridge_softc *, void *); static int bridge_ioctl_gpri(struct bridge_softc *, void *); static int bridge_ioctl_spri(struct bridge_softc *, void *); static int bridge_ioctl_ght(struct bridge_softc *, void *); static int bridge_ioctl_sht(struct bridge_softc *, void *); static int bridge_ioctl_gfd(struct bridge_softc *, void *); static int bridge_ioctl_sfd(struct bridge_softc *, void *); static int bridge_ioctl_gma(struct bridge_softc *, void *); static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); static int bridge_ioctl_delspan(struct bridge_softc *, void *); static int bridge_ioctl_gbparam(struct bridge_softc *, void *); static int bridge_ioctl_grte(struct bridge_softc *, void *); static int bridge_ioctl_gifsstp(struct bridge_softc *, void *); static int bridge_ioctl_sproto(struct bridge_softc *, void *); static int bridge_ioctl_stxhc(struct bridge_softc *, void *); static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, int); static int bridge_ip_checkbasic(struct mbuf **mp); #ifdef INET6 static int bridge_ip6_checkbasic(struct mbuf **mp); #endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf *, struct ether_header *, int, struct llc *); static void bridge_linkstate(struct ifnet *ifp); static void bridge_linkcheck(struct bridge_softc *sc); extern void 
(*bridge_linkstate_p)(struct ifnet *ifp); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) \ (_m->m_flags & M_VLANTAG) ? EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1 static struct bstp_cb_ops bridge_ops = { .bcb_state = bridge_state_change, .bcb_rtage = bridge_rtable_expire }; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); /* only pass IP[46] packets when pfil is enabled */ static VNET_DEFINE(int, pfil_onlyip) = 1; #define V_pfil_onlyip VNET(pfil_onlyip) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0, "Only pass IP packets when pfil is enabled"); /* run pfil hooks on the bridge interface */ static VNET_DEFINE(int, pfil_bridge) = 1; #define V_pfil_bridge VNET(pfil_bridge) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0, "Packet filter on the bridge interface"); /* layer2 filter with ipfw */ static VNET_DEFINE(int, pfil_ipfw); #define V_pfil_ipfw VNET(pfil_ipfw) /* layer2 ARP filter with ipfw */ static VNET_DEFINE(int, pfil_ipfw_arp); #define V_pfil_ipfw_arp VNET(pfil_ipfw_arp) SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0, "Filter ARP packets through IPFW layer2"); /* run pfil hooks on the member interface */ static VNET_DEFINE(int, pfil_member) = 1; #define V_pfil_member VNET(pfil_member) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0, "Packet filter on the member interface"); /* run pfil hooks on the physical interface for locally destined packets */ static VNET_DEFINE(int, pfil_local_phys); #define V_pfil_local_phys VNET(pfil_local_phys) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0, "Packet filter on the physical interface for locally destined packets"); /* log STP state changes */ static VNET_DEFINE(int, log_stp); #define V_log_stp VNET(log_stp) SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0, "Log STP state changes"); /* share MAC with first bridge member */ static VNET_DEFINE(int, bridge_inherit_mac); #define V_bridge_inherit_mac VNET(bridge_inherit_mac) SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0, "Inherit MAC address from the first bridge member"); static VNET_DEFINE(int, allow_llz_overlap) = 0; #define V_allow_llz_overlap VNET(allow_llz_overlap) SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, "Allow overlap of link-local scope " "zones of a bridge interface and the member interfaces"); struct bridge_control { int (*bc_func)(struct bridge_softc *, void *); int bc_argsize; int bc_flags; }; #define BC_F_COPYIN 0x01 /* copy arguments in */ #define BC_F_COPYOUT 0x02 /* copy arguments out */ #define BC_F_SUSER 0x04 /* do super-user check */ const struct bridge_control bridge_control_table[] = { { bridge_ioctl_add, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_del, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_scache, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gcache, 
sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifs, sizeof(struct ifbifconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_rts, sizeof(struct ifbaconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_saddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gto, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_daddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_flush, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gpri, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_spri, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_ght, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sht, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gfd, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sfd, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gma, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sma, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifprio, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifcost, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_addspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_delspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gbparam, sizeof(struct ifbropreq), BC_F_COPYOUT }, { bridge_ioctl_grte, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sproto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_stxhc, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, }; const int bridge_control_table_size = nitems(bridge_control_table); static VNET_DEFINE(LIST_HEAD(, bridge_softc), bridge_list); #define V_bridge_list VNET(bridge_list) #define BRIDGE_LIST_LOCK_INIT(x) mtx_init(&V_bridge_list_mtx, \ "if_bridge list", NULL, MTX_DEF) #define BRIDGE_LIST_LOCK_DESTROY(x) mtx_destroy(&V_bridge_list_mtx) #define BRIDGE_LIST_LOCK(x) mtx_lock(&V_bridge_list_mtx) #define BRIDGE_LIST_UNLOCK(x) mtx_unlock(&V_bridge_list_mtx) static VNET_DEFINE(struct if_clone *, bridge_cloner); #define V_bridge_cloner VNET(bridge_cloner) static const char bridge_name[] = "bridge"; static void vnet_bridge_init(const void *unused __unused) { BRIDGE_LIST_LOCK_INIT(); LIST_INIT(&V_bridge_list); V_bridge_cloner = if_clone_simple(bridge_name, bridge_clone_create, bridge_clone_destroy, 0); } VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_init, NULL); static void vnet_bridge_uninit(const void *unused __unused) { if_clone_detach(V_bridge_cloner); V_bridge_cloner = NULL; BRIDGE_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_uninit, NULL); static int bridge_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: bridge_rtnode_zone = uma_zcreate("bridge_rtnode", sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); bridge_input_p = bridge_input; bridge_output_p = bridge_output; bridge_dn_p = bridge_dummynet; bridge_linkstate_p = bridge_linkstate; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); 
uma_zdestroy(bridge_rtnode_zone); bridge_input_p = NULL; bridge_output_p = NULL; bridge_dn_p = NULL; bridge_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t bridge_mod = { "if_bridge", bridge_modevent, 0 }; DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1); /* * handler for net.link.bridge.ipfw */ static int sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) { int enable = V_pfil_ipfw; int error; error = sysctl_handle_int(oidp, &enable, 0, req); enable &= 1; if (enable != V_pfil_ipfw) { V_pfil_ipfw = enable; /* * Disable pfil so that ipfw doesnt run twice, if the user * really wants both then they can re-enable pfil_bridge and/or * pfil_member. Also allow non-ip packets as ipfw can filter by * layer2 type. */ if (V_pfil_ipfw) { V_pfil_onlyip = 0; V_pfil_bridge = 0; V_pfil_member = 0; } } return (error); } SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); /* * bridge_clone_create: * * Create a new bridge instance. */ static int bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct bridge_softc *sc, *sc2; struct ifnet *bifp, *ifp; int fb, retry; unsigned long hostid; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } BRIDGE_LOCK_INIT(sc); sc->sc_brtmax = BRIDGE_RTABLE_MAX; sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; /* Initialize our routing table. */ bridge_rtable_init(sc); callout_init_mtx(&sc->sc_brcallout, &sc->sc_mtx, 0); LIST_INIT(&sc->sc_iflist); LIST_INIT(&sc->sc_spanlist); ifp->if_softc = sc; if_initname(ifp, bridge_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = bridge_ioctl; ifp->if_transmit = bridge_transmit; ifp->if_qflush = bridge_qflush; ifp->if_init = bridge_init; ifp->if_type = IFT_BRIDGE; /* * Generate an ethernet address with a locally administered address. * * Since we are using random ethernet addresses for the bridge, it is * possible that we might have address collisions, so make sure that * this hardware address isn't already in use on another bridge. * The first try uses the hostid and falls back to arc4rand(). */ fb = 0; getcredhostid(curthread->td_ucred, &hostid); do { if (fb || hostid == 0) { arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ sc->sc_defaddr[0] |= 2; /* set the LAA bit */ } else { sc->sc_defaddr[0] = 0x2; sc->sc_defaddr[1] = (hostid >> 24) & 0xff; sc->sc_defaddr[2] = (hostid >> 16) & 0xff; sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff; sc->sc_defaddr[4] = hostid & 0xff; sc->sc_defaddr[5] = ifp->if_dunit & 0xff; } fb = 1; retry = 0; BRIDGE_LIST_LOCK(); LIST_FOREACH(sc2, &V_bridge_list, sc_list) { bifp = sc2->sc_ifp; if (memcmp(sc->sc_defaddr, IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) { retry = 1; break; } } BRIDGE_LIST_UNLOCK(); } while (retry == 1); bstp_attach(&sc->sc_stp, &bridge_ops); ether_ifattach(ifp, sc->sc_defaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; BRIDGE_LIST_LOCK(); LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list); BRIDGE_LIST_UNLOCK(); return (0); } /* * bridge_clone_destroy: * * Destroy a bridge instance. 
*/ static void bridge_clone_destroy(struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_softc; struct bridge_iflist *bif; BRIDGE_LOCK(sc); bridge_stop(ifp, 1); ifp->if_flags &= ~IFF_UP; while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL) bridge_delete_member(sc, bif, 0); while ((bif = LIST_FIRST(&sc->sc_spanlist)) != NULL) { bridge_delete_span(sc, bif); } BRIDGE_UNLOCK(sc); callout_drain(&sc->sc_brcallout); BRIDGE_LIST_LOCK(); LIST_REMOVE(sc, sc_list); BRIDGE_LIST_UNLOCK(); bstp_detach(&sc->sc_stp); ether_ifdetach(ifp); if_free(ifp); /* Tear down the routing table. */ bridge_rtable_fini(sc); BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } /* * bridge_ioctl: * * Handle a control request from the operator. */ static int bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct bridge_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct bridge_iflist *bif; struct thread *td = curthread; union { struct ifbreq ifbreq; struct ifbifconf ifbifconf; struct ifbareq ifbareq; struct ifbaconf ifbaconf; struct ifbrparam ifbrparam; struct ifbropreq ifbropreq; } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; int error = 0; switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: if (ifd->ifd_cmd >= bridge_control_table_size) { error = EINVAL; break; } bc = &bridge_control_table[ifd->ifd_cmd]; if (cmd == SIOCGDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) == 0) { error = EINVAL; break; } else if (cmd == SIOCSDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) != 0) { error = EINVAL; break; } if (bc->bc_flags & BC_F_SUSER) { error = priv_check(td, PRIV_NET_BRIDGE); if (error) break; } if (ifd->ifd_len != bc->bc_argsize || ifd->ifd_len > sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); if (bc->bc_flags & BC_F_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; } BRIDGE_LOCK(sc); error = (*bc->bc_func)(sc, &args); BRIDGE_UNLOCK(sc); if (error) break; if (bc->bc_flags & BC_F_COPYOUT) error = copyout(&args, ifd->ifd_data, ifd->ifd_len); break; case SIOCSIFFLAGS: if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ BRIDGE_LOCK(sc); bridge_stop(ifp, 1); BRIDGE_UNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ (*ifp->if_init)(sc); } break; case SIOCSIFMTU: if (ifr->ifr_mtu < 576) { error = EINVAL; break; } if (LIST_EMPTY(&sc->sc_iflist)) { sc->sc_ifp->if_mtu = ifr->ifr_mtu; break; } BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) { log(LOG_NOTICE, "%s: invalid MTU: %u(%s)" " != %d\n", sc->sc_ifp->if_xname, bif->bif_ifp->if_mtu, bif->bif_ifp->if_xname, ifr->ifr_mtu); error = EINVAL; break; } } if (!error) sc->sc_ifp->if_mtu = ifr->ifr_mtu; BRIDGE_UNLOCK(sc); break; default: /* * drop the lock as ether_ioctl() will call bridge_start() and * cause the lock to be recursed. 
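/*
 * Illustrative userspace consumer of the SIOCGDRVSPEC path handled by
 * bridge_ioctl() below, assuming a FreeBSD system with a bridge named
 * "bridge0" (placeholder).  ifd_cmd indexes bridge_control_table; BRDGGCACHE
 * and struct ifbrparam come from net/if_bridgevar.h, and this particular
 * request needs no privilege because its table entry only sets BC_F_COPYOUT.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_bridgevar.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifbrparam param;
	struct ifdrv ifd;
	int s;

	if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) < 0)
		err(1, "socket");
	memset(&ifd, 0, sizeof(ifd));
	memset(&param, 0, sizeof(param));
	strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name));
	ifd.ifd_cmd = BRDGGCACHE;	/* -> bridge_ioctl_gcache() */
	ifd.ifd_len = sizeof(param);	/* must equal bc_argsize */
	ifd.ifd_data = &param;
	if (ioctl(s, SIOCGDRVSPEC, &ifd) < 0)
		err(1, "SIOCGDRVSPEC");
	printf("bridge0 address cache limit: %u\n", param.ifbrp_csize);
	close(s);
	return (0);
}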
*/ error = ether_ioctl(ifp, cmd, data); break; } return (error); } /* * bridge_mutecaps: * * Clear or restore unwanted capabilities on the member interface */ static void bridge_mutecaps(struct bridge_softc *sc) { struct bridge_iflist *bif; int enabled, mask; /* Initial bitmask of capabilities to test */ mask = BRIDGE_IFCAPS_MASK; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { /* Every member must support it or its disabled */ mask &= bif->bif_savedcaps; } LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { enabled = bif->bif_ifp->if_capenable; enabled &= ~BRIDGE_IFCAPS_STRIP; /* strip off mask bits and enable them again if allowed */ enabled &= ~BRIDGE_IFCAPS_MASK; enabled |= mask; bridge_set_ifcap(sc, bif, enabled); } } static void bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) { struct ifnet *ifp = bif->bif_ifp; struct ifreq ifr; int error; bzero(&ifr, sizeof(ifr)); ifr.ifr_reqcap = set; if (ifp->if_capenable != set) { error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); if (error) if_printf(sc->sc_ifp, "error setting interface capabilities on %s\n", ifp->if_xname); } } /* * bridge_lookup_member: * * Lookup a bridge member interface. */ static struct bridge_iflist * bridge_lookup_member(struct bridge_softc *sc, const char *name) { struct bridge_iflist *bif; struct ifnet *ifp; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { ifp = bif->bif_ifp; if (strcmp(ifp->if_xname, name) == 0) return (bif); } return (NULL); } /* * bridge_lookup_member_if: * * Lookup a bridge member interface by ifnet*. */ static struct bridge_iflist * bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) { struct bridge_iflist *bif; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp == member_ifp) return (bif); } return (NULL); } /* * bridge_delete_member: * * Delete the specified member interface. */ static void bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, int gone) { struct ifnet *ifs = bif->bif_ifp; struct ifnet *fif = NULL; BRIDGE_LOCK_ASSERT(sc); if (bif->bif_flags & IFBIF_STP) bstp_disable(&bif->bif_stp); ifs->if_bridge = NULL; BRIDGE_XLOCK(sc); LIST_REMOVE(bif, bif_next); BRIDGE_XDROP(sc); /* * If removing the interface that gave the bridge its mac address, set * the mac address of the bridge to the address of the next member, or * to its default address if no members are left. */ if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (LIST_EMPTY(&sc->sc_iflist)) { bcopy(sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = NULL; } else { fif = LIST_FIRST(&sc->sc_iflist)->bif_ifp; bcopy(IF_LLADDR(fif), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = fif; } EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } bridge_linkcheck(sc); bridge_mutecaps(sc); /* recalcuate now this interface is removed */ bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); KASSERT(bif->bif_addrcnt == 0, ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); BRIDGE_UNLOCK(sc); if (!gone) { switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: /* * Take the interface out of promiscuous mode, but only * if it was promiscuous in the first place. It might * not be if we're in the bridge_ioctl_add() error path. 
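/*
 * Worked example, with hypothetical member capabilities, of the intersection
 * computed by bridge_mutecaps() above: a maskable capability survives only
 * if every member advertises it, so here TSO ends up muted on both members.
 */
static int
bridge_mutecaps_example(void)
{
	int mask = BRIDGE_IFCAPS_MASK;		/* IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM */

	mask &= (IFCAP_TXCSUM | IFCAP_TSO);	/* hypothetical member em0 */
	mask &= IFCAP_TXCSUM;			/* hypothetical member re0 */
	return (mask);				/* only IFCAP_TXCSUM stays enabled */
}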
*/ if (ifs->if_flags & IFF_PROMISC) (void) ifpromisc(ifs, 0); break; case IFT_GIF: break; default: #ifdef DIAGNOSTIC panic("bridge_delete_member: impossible"); #endif break; } /* reneable any interface capabilities */ bridge_set_ifcap(sc, bif, bif->bif_savedcaps); } bstp_destroy(&bif->bif_stp); /* prepare to free */ BRIDGE_LOCK(sc); free(bif, M_DEVBUF); } /* * bridge_delete_span: * * Delete the specified span interface. */ static void bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif) { BRIDGE_LOCK_ASSERT(sc); KASSERT(bif->bif_ifp->if_bridge == NULL, ("%s: not a span interface", __func__)); LIST_REMOVE(bif, bif_next); free(bif, M_DEVBUF); } static int bridge_ioctl_add(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; int error = 0; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); if (ifs->if_ioctl == NULL) /* must be supported */ return (EINVAL); /* If it's in the span list, it can't be a member. */ LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge == sc) return (EEXIST); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_GIF: /* permitted interface types */ break; default: return (EINVAL); } #ifdef INET6 /* * Two valid inet6 addresses with link-local scope must not be * on the parent interface and the member interfaces at the * same time. This restriction is needed to prevent violation * of link-local scope zone. Attempts to add a member * interface which has inet6 addresses when the parent has * inet6 triggers removal of all inet6 addresses on the member * interface. */ /* Check if the parent interface has a link-local scope addr. */ if (V_allow_llz_overlap == 0 && in6ifa_llaonifp(sc->sc_ifp) != NULL) { /* * If any, remove all inet6 addresses from the member * interfaces. */ BRIDGE_XLOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (in6ifa_llaonifp(bif->bif_ifp)) { BRIDGE_UNLOCK(sc); - in6_ifdetach(bif->bif_ifp); + in6_ifdetach(bif->bif_ifp, 1); BRIDGE_LOCK(sc); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", bif->bif_ifp->if_xname); } } BRIDGE_XDROP(sc); if (in6ifa_llaonifp(ifs)) { BRIDGE_UNLOCK(sc); - in6_ifdetach(ifs); + in6_ifdetach(ifs, 1); BRIDGE_LOCK(sc); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", ifs->if_xname); } } #endif /* Allow the first Ethernet member to define the MTU */ if (LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU: %u(%s) != %u\n", ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu); return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; bif->bif_savedcaps = ifs->if_capenable; /* * Assign the interface's MAC address to the bridge if it's the first * member and the MAC address of the bridge has not been changed from * the default randomly generated one. 
*/ if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } ifs->if_bridge = sc; bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); /* * XXX: XLOCK HERE!?! * * NOTE: insert_***HEAD*** should be safe for the traversals. */ LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next); /* Set interface capabilities to the intersection set of all members */ bridge_mutecaps(sc); bridge_linkcheck(sc); /* Place the interface into promiscuous mode */ switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: BRIDGE_UNLOCK(sc); error = ifpromisc(ifs, 1); BRIDGE_LOCK(sc); break; } if (error) bridge_delete_member(sc, bif, 0); return (error); } static int bridge_ioctl_del(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bridge_delete_member(sc, bif, 0); return (0); } static int bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; req->ifbr_ifsflags = bif->bif_flags; req->ifbr_state = bp->bp_state; req->ifbr_priority = bp->bp_priority; req->ifbr_path_cost = bp->bp_path_cost; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_proto = bp->bp_protover; req->ifbr_role = bp->bp_role; req->ifbr_stpflags = bp->bp_flags; req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; /* Copy STP state options as flags */ if (bp->bp_operedge) req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; if (bp->bp_flags & BSTP_PORT_AUTOEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; if (bp->bp_ptp_link) req->ifbr_ifsflags |= IFBIF_BSTP_PTP; if (bp->bp_flags & BSTP_PORT_AUTOPTP) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; if (bp->bp_flags & BSTP_PORT_ADMEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; if (bp->bp_flags & BSTP_PORT_ADMCOST) req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; return (0); } static int bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; int error; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ return (EINVAL); if (req->ifbr_ifsflags & IFBIF_STP) { if ((bif->bif_flags & IFBIF_STP) == 0) { error = bstp_enable(&bif->bif_stp); if (error) return (error); } } else { if ((bif->bif_flags & IFBIF_STP) != 0) bstp_disable(&bif->bif_stp); } /* Pass on STP flags */ bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 
1 : 0); /* Save the bits relating to the bridge */ bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; return (0); } static int bridge_ioctl_scache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brtmax = param->ifbrp_csize; bridge_rttrim(sc); return (0); } static int bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_csize = sc->sc_brtmax; return (0); } static int bridge_ioctl_gifs(struct bridge_softc *sc, void *arg) { struct ifbifconf *bifc = arg; struct bridge_iflist *bif; struct ifbreq breq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) count++; LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) count++; buflen = sizeof(breq) * count; if (bifc->ifbic_len == 0) { bifc->ifbic_len = buflen; return (0); } BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bifc->ifbic_len, buflen); bzero(&breq, sizeof(breq)); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); /* Fill in the ifbreq structure */ error = bridge_ioctl_gifflags(sc, &breq); if (error) break; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); breq.ifbr_ifsflags = bif->bif_flags; breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } BRIDGE_UNLOCK(sc); bifc->ifbic_len = sizeof(breq) * count; error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_rts(struct bridge_softc *sc, void *arg) { struct ifbaconf *bac = arg; struct bridge_rtnode *brt; struct ifbareq bareq; char *buf, *outbuf; int count, buflen, len, error = 0; if (bac->ifbac_len == 0) return (0); count = 0; LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) count++; buflen = sizeof(bareq) * count; BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bac->ifbac_len, buflen); bzero(&bareq, sizeof(bareq)); LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { if (len < sizeof(bareq)) goto out; strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname, sizeof(bareq.ifba_ifsname)); memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); bareq.ifba_vlan = brt->brt_vlan; if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && time_uptime < brt->brt_expire) bareq.ifba_expire = brt->brt_expire - time_uptime; else bareq.ifba_expire = 0; bareq.ifba_flags = brt->brt_flags; memcpy(buf, &bareq, sizeof(bareq)); count++; buf += sizeof(bareq); len -= sizeof(bareq); } out: BRIDGE_UNLOCK(sc); bac->ifbac_len = sizeof(bareq) * count; error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; struct bridge_iflist *bif; int error; bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) return (ENOENT); error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, req->ifba_flags); return (error); } static int bridge_ioctl_sto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brttimeout = 
param->ifbrp_ctime; return (0); } static int bridge_ioctl_gto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_ctime = sc->sc_brttimeout; return (0); } static int bridge_ioctl_daddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); } static int bridge_ioctl_flush(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; bridge_rtflush(sc, req->ifbr_ifsflags); return (0); } static int bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_prio = bs->bs_bridge_priority; return (0); } static int bridge_ioctl_spri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); } static int bridge_ioctl_ght(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; return (0); } static int bridge_ioctl_sht(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); } static int bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; return (0); } static int bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); } static int bridge_ioctl_gma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; return (0); } static int bridge_ioctl_sma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); } static int bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); } static int bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); } static int bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bif->bif_addrmax = req->ifbr_addrmax; return (0); } static int bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_GIF: case IFT_L2VLAN: break; default: return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_SPAN; LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); return (0); } static int bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct ifnet *ifs; ifs = 
ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) break; if (bif == NULL) return (ENOENT); bridge_delete_span(sc, bif); return (0); } static int bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg) { struct ifbropreq *req = arg; struct bstp_state *bs = &sc->sc_stp; struct bstp_port *root_port; req->ifbop_maxage = bs->bs_bridge_max_age >> 8; req->ifbop_hellotime = bs->bs_bridge_htime >> 8; req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; root_port = bs->bs_root_port; if (root_port == NULL) req->ifbop_root_port = 0; else req->ifbop_root_port = root_port->bp_ifp->if_index; req->ifbop_holdcount = bs->bs_txholdcount; req->ifbop_priority = bs->bs_bridge_priority; req->ifbop_protocol = bs->bs_protover; req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; return (0); } static int bridge_ioctl_grte(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_cexceeded = sc->sc_brtexceeded; return (0); } static int bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg) { struct ifbpstpconf *bifstp = arg; struct bridge_iflist *bif; struct bstp_port *bp; struct ifbpstpreq bpreq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if ((bif->bif_flags & IFBIF_STP) != 0) count++; } buflen = sizeof(bpreq) * count; if (bifstp->ifbpstp_len == 0) { bifstp->ifbpstp_len = buflen; return (0); } BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bifstp->ifbpstp_len, buflen); bzero(&bpreq, sizeof(bpreq)); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(bpreq)) break; if ((bif->bif_flags & IFBIF_STP) == 0) continue; bp = &bif->bif_stp; bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; memcpy(buf, &bpreq, sizeof(bpreq)); count++; buf += sizeof(bpreq); len -= sizeof(bpreq); } BRIDGE_UNLOCK(sc); bifstp->ifbpstp_len = sizeof(bpreq) * count; error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); } static int bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); } /* * bridge_ifdetach: * * Detach an interface from a bridge. Called when a member * interface is detaching. */ static void bridge_ifdetach(void *arg __unused, struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; if (ifp->if_flags & IFF_RENAMING) return; if (V_bridge_cloner == NULL) { /* * This detach handler can be called after * vnet_bridge_uninit(). Just return in that case. 
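 * A NULL V_bridge_cloner means the per-vnet bridge state is already gone, so there is nothing left to do here.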
*/ return; } /* Check if the interface is a bridge member */ if (sc != NULL) { BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif != NULL) bridge_delete_member(sc, bif, 1); BRIDGE_UNLOCK(sc); return; } /* Check if the interface is a span port */ BRIDGE_LIST_LOCK(); LIST_FOREACH(sc, &V_bridge_list, sc_list) { BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifp == bif->bif_ifp) { bridge_delete_span(sc, bif); break; } BRIDGE_UNLOCK(sc); } BRIDGE_LIST_UNLOCK(); } /* * bridge_init: * * Initialize a bridge interface. */ static void bridge_init(void *xsc) { struct bridge_softc *sc = (struct bridge_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; BRIDGE_LOCK(sc); callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ BRIDGE_UNLOCK(sc); } /* * bridge_stop: * * Stop the bridge interface. */ static void bridge_stop(struct ifnet *ifp, int disable) { struct bridge_softc *sc = ifp->if_softc; BRIDGE_LOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; callout_stop(&sc->sc_brcallout); bstp_stop(&sc->sc_stp); bridge_rtflush(sc, IFBF_FLUSHDYN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * bridge_enqueue: * * Enqueue a packet on a bridge member interface. * */ static int bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) { int len, err = 0; short mflags; struct mbuf *m0; /* We may be sending a fragment so traverse the mbuf */ for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; len = m->m_pkthdr.len; mflags = m->m_flags; /* * If underlying interface can not do VLAN tag insertion itself * then attach a packet tag that holds it. */ if ((m->m_flags & M_VLANTAG) && (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { if_printf(dst_ifp, "unable to prepend VLAN header\n"); if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1); continue; } m->m_flags &= ~M_VLANTAG; } if ((err = dst_ifp->if_transmit(dst_ifp, m))) { m_freem(m0); if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); break; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1); } return (err); } /* * bridge_dummynet: * * Receive a queued packet from dummynet and pass it on to the output * interface. * * The mbuf has the Ethernet header already attached. */ static void bridge_dummynet(struct mbuf *m, struct ifnet *ifp) { struct bridge_softc *sc; sc = ifp->if_bridge; /* * The packet didnt originate from a member interface. This should only * ever happen if a member interface is removed while packets are * queued for it. */ if (sc == NULL) { m_freem(m); return; } if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) return; if (m == NULL) return; } bridge_enqueue(sc, ifp, m); } /* * bridge_output: * * Send output from a bridge member interface. This * performs the bridging function for locally originated * packets. * * The mbuf has the Ethernet header already attached. We must * enqueue or free the mbuf before returning. 
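 * When the bridge is running, a known unicast destination goes out the learned member only; anything else is flooded to all members that are up and not discarding.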
*/ static int bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, struct rtentry *rt) { struct ether_header *eh; struct ifnet *dst_if; struct bridge_softc *sc; uint16_t vlan; if (m->m_len < ETHER_HDR_LEN) { m = m_pullup(m, ETHER_HDR_LEN); if (m == NULL) return (0); } eh = mtod(m, struct ether_header *); sc = ifp->if_bridge; vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); /* * If bridge is down, but the original output interface is up, * go ahead and send out that interface. Otherwise, the packet * is dropped below. */ if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { dst_if = ifp; goto sendunicast; } /* * If the packet is a multicast, or we don't know a better way to * get there, send to all interfaces. */ if (ETHER_IS_MULTICAST(eh->ether_dhost)) dst_if = NULL; else dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); if (dst_if == NULL) { struct bridge_iflist *bif; struct mbuf *mc; int error = 0, used = 0; bridge_span(sc, m); BRIDGE_LOCK2REF(sc, error); if (error) { m_freem(m); return (0); } LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { dst_if = bif->bif_ifp; if (dst_if->if_type == IFT_GIF) continue; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; /* * If this is not the original output interface, * and the interface is participating in spanning * tree, make sure the port is in a state that * allows forwarding. */ if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; if (LIST_NEXT(bif, bif_next) == NULL) { used = 1; mc = m; } else { mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); BRIDGE_UNREF(sc); return (0); } sendunicast: /* * XXX Spanning tree consideration here? */ bridge_span(sc, m); if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) { m_freem(m); BRIDGE_UNLOCK(sc); return (0); } BRIDGE_UNLOCK(sc); bridge_enqueue(sc, dst_if, m); return (0); } /* * bridge_transmit: * * Do output on a bridge. * */ static int bridge_transmit(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc; struct ether_header *eh; struct ifnet *dst_if; int error = 0; sc = ifp->if_softc; ETHER_BPF_MTAP(ifp, m); eh = mtod(m, struct ether_header *); BRIDGE_LOCK(sc); if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) && (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) { BRIDGE_UNLOCK(sc); error = bridge_enqueue(sc, dst_if, m); } else bridge_broadcast(sc, ifp, m, 0); return (error); } /* * The ifp->if_qflush entry point for if_bridge(4) is no-op. */ static void bridge_qflush(struct ifnet *ifp __unused) { } /* * bridge_forward: * * The forwarding function of the bridge. * * NOTE: Releases the lock on return. */ static void bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, struct mbuf *m) { struct bridge_iflist *dbif; struct ifnet *src_if, *dst_if, *ifp; struct ether_header *eh; uint16_t vlan; uint8_t *dst; int error; src_if = m->m_pkthdr.rcvif; ifp = sc->sc_ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); vlan = VLANTAGOF(m); if ((sbif->bif_flags & IFBIF_STP) && sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) goto drop; eh = mtod(m, struct ether_header *); dst = eh->ether_dhost; /* If the interface is learning, record the address. 
*/ if (sbif->bif_flags & IFBIF_LEARNING) { error = bridge_rtupdate(sc, eh->ether_shost, vlan, sbif, 0, IFBAF_DYNAMIC); /* * If the interface has addresses limits then deny any source * that is not in the cache. */ if (error && sbif->bif_addrmax) goto drop; } if ((sbif->bif_flags & IFBIF_STP) != 0 && sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING) goto drop; /* * At this point, the port either doesn't participate * in spanning tree or it is in the forwarding state. */ /* * If the packet is unicast, destined for someone on * "this" side of the bridge, drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { dst_if = bridge_rtlookup(sc, dst, vlan); if (src_if == dst_if) goto drop; } else { /* * Check if its a reserved multicast address, any address * listed in 802.1D section 7.12.6 may not be forwarded by the * bridge. * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F */ if (dst[0] == 0x01 && dst[1] == 0x80 && dst[2] == 0xc2 && dst[3] == 0x00 && dst[4] == 0x00 && dst[5] <= 0x0f) goto drop; /* ...forward it to all interfaces. */ if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); dst_if = NULL; } /* * If we have a destination interface which is a member of our bridge, * OR this is a unicast packet, push it through the bpf(4) machinery. * For broadcast or multicast packets, don't bother because it will * be reinjected into ether_input. We do this before we pass the packets * through the pfil(9) framework, as it is possible that pfil(9) will * drop the packet, or possibly modify it, making it difficult to debug * firewall issues on the bridge. */ if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) ETHER_BPF_MTAP(ifp, m); /* run the packet filter */ if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { BRIDGE_UNLOCK(sc); if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0) return; if (m == NULL) return; BRIDGE_LOCK(sc); } if (dst_if == NULL) { bridge_broadcast(sc, src_if, m, 1); return; } /* * At this point, we're dealing with a unicast frame * going to a different interface. */ if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) goto drop; dbif = bridge_lookup_member_if(sc, dst_if); if (dbif == NULL) /* Not a member of the bridge (anymore?) */ goto drop; /* Private segments can not talk to each other */ if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE) goto drop; if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) goto drop; BRIDGE_UNLOCK(sc); if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) return; if (m == NULL) return; } bridge_enqueue(sc, dst_if, m); return; drop: BRIDGE_UNLOCK(sc); m_freem(m); } /* * bridge_input: * * Receive input from a member interface. Queue the packet for * bridging if it is not for us. */ static struct mbuf * bridge_input(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif, *bif2; struct ifnet *bifp; struct ether_header *eh; struct mbuf *mc, *mc2; uint16_t vlan; int error; if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return (m); bifp = sc->sc_ifp; vlan = VLANTAGOF(m); /* * Implement support for bridge monitoring. If this flag has been * set on this interface, discard the packet once we push it through * the bpf(4) machinery, but before we do, increment the byte and * packet counters associated with this interface. 
*/ if ((bifp->if_flags & IFF_MONITOR) != 0) { m->m_pkthdr.rcvif = bifp; ETHER_BPF_MTAP(bifp, m); if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return (NULL); } BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); return (m); } eh = mtod(m, struct ether_header *); bridge_span(sc, m); if (m->m_flags & (M_BCAST|M_MCAST)) { /* Tap off 802.1D packets; they do not get forwarded. */ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */ BRIDGE_UNLOCK(sc); return (NULL); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); return (m); } /* * Make a deep copy of the packet and enqueue the copy * for bridge processing; return the original packet for * local processing. */ mc = m_dup(m, M_NOWAIT); if (mc == NULL) { BRIDGE_UNLOCK(sc); return (m); } /* Perform the bridge forwarding function with the copy. */ bridge_forward(sc, bif, mc); /* * Reinject the mbuf as arriving on the bridge so we have a * chance at claiming multicast packets. We can not loop back * here from ether_input as a bridge is never a member of a * bridge. */ KASSERT(bifp->if_bridge == NULL, ("loop created in bridge_input")); mc2 = m_dup(m, M_NOWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ int i = min(mc2->m_pkthdr.len, max_protohdr); mc2 = m_copyup(mc2, i, ETHER_ALIGN); } if (mc2 != NULL) { mc2->m_pkthdr.rcvif = bifp; (*bifp->if_input)(bifp, mc2); } /* Return the original packet for local processing. */ return (m); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); return (m); } #if (defined(INET) || defined(INET6)) # define OR_CARP_CHECK_WE_ARE_DST(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_dhost)) # define OR_CARP_CHECK_WE_ARE_SRC(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_shost)) #else # define OR_CARP_CHECK_WE_ARE_DST(iface) # define OR_CARP_CHECK_WE_ARE_SRC(iface) #endif #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ || PFIL_HOOKED(&V_inet6_pfil_hook) #else # define OR_PFIL_HOOKED_INET6 #endif #define GRAB_OUR_PACKETS(iface) \ if ((iface)->if_type == IFT_GIF) \ continue; \ /* It is destined for us. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ if ((iface)->if_type == IFT_BRIDGE) { \ ETHER_BPF_MTAP(iface, m); \ if_inc_counter(iface, IFCOUNTER_IPACKETS, 1); \ if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ /* Filter on the physical interface. */ \ if (V_pfil_local_phys && \ (PFIL_HOOKED(&V_inet_pfil_hook) \ OR_PFIL_HOOKED_INET6)) { \ if (bridge_pfil(&m, NULL, ifp, \ PFIL_IN) != 0 || m == NULL) { \ BRIDGE_UNLOCK(sc); \ return (NULL); \ } \ eh = mtod(m, struct ether_header *); \ } \ } \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ BRIDGE_UNLOCK(sc); \ m_freem(m); \ return (NULL); \ } \ } \ m->m_pkthdr.rcvif = iface; \ BRIDGE_UNLOCK(sc); \ return (m); \ } \ \ /* We just received a packet that we sent out. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_SRC((iface)) \ ) { \ BRIDGE_UNLOCK(sc); \ m_freem(m); \ return (NULL); \ } /* * Unicast. Make sure it's not for the bridge. 
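 * The bridge interface itself is checked first, then the receiving interface, and finally every other member, using GRAB_OUR_PACKETS() defined above.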
*/ do { GRAB_OUR_PACKETS(bifp) } while (0); /* * Give a chance for ifp at first priority. This will help when the * packet comes through the interface like VLAN's with the same MACs * on several interfaces from the same bridge. This also will save * some CPU cycles in case the destination interface and the input * interface (eq ifp) are the same. */ do { GRAB_OUR_PACKETS(ifp) } while (0); /* Now check the all bridge members. */ LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) { GRAB_OUR_PACKETS(bif2->bif_ifp) } #undef OR_CARP_CHECK_WE_ARE_DST #undef OR_CARP_CHECK_WE_ARE_SRC #undef OR_PFIL_HOOKED_INET6 #undef GRAB_OUR_PACKETS /* Perform the bridge forwarding function. */ bridge_forward(sc, bif, m); return (NULL); } /* * bridge_broadcast: * * Send a frame to all interfaces that are members of * the bridge, except for the one on which the packet * arrived. * * NOTE: Releases the lock on return. */ static void bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, struct mbuf *m, int runfilt) { struct bridge_iflist *dbif, *sbif; struct mbuf *mc; struct ifnet *dst_if; int error = 0, used = 0, i; sbif = bridge_lookup_member_if(sc, src_if); BRIDGE_LOCK2REF(sc, error); if (error) { m_freem(m); return; } /* Filter on the bridge interface before broadcasting */ if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif )) { if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) goto out; if (m == NULL) goto out; } LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) { dst_if = dbif->bif_ifp; if (dst_if == src_if) continue; /* Private segments can not talk to each other */ if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) continue; if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 && (m->m_flags & (M_BCAST|M_MCAST)) == 0) continue; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; if (LIST_NEXT(dbif, bif_next) == NULL) { mc = m; used = 1; } else { mc = m_dup(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } /* * Filter on the output interface. Pass a NULL bridge interface * pointer so we do not redundantly filter on the bridge for * each interface we broadcast on. */ if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif )) { if (used == 0) { /* Keep the layer3 header aligned */ i = min(mc->m_pkthdr.len, max_protohdr); mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) continue; if (mc == NULL) continue; } bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); out: BRIDGE_UNREF(sc); } /* * bridge_span: * * Duplicate a packet out one or more interfaces that are in span mode, * the original mbuf is unmodified. */ static void bridge_span(struct bridge_softc *sc, struct mbuf *m) { struct bridge_iflist *bif; struct ifnet *dst_if; struct mbuf *mc; if (LIST_EMPTY(&sc->sc_spanlist)) return; LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { dst_if = bif->bif_ifp; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } bridge_enqueue(sc, dst_if, mc); } } /* * bridge_rtupdate: * * Add a bridge routing entry. 
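 * Entries are keyed on destination address and vlan; an existing entry is refreshed in place, and dynamic entries have their expiry time pushed out.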
*/ static int bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, struct bridge_iflist *bif, int setflags, uint8_t flags) { struct bridge_rtnode *brt; int error; BRIDGE_LOCK_ASSERT(sc); /* Check the source address is valid and not multicast. */ if (ETHER_IS_MULTICAST(dst) || (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) return (EINVAL); /* 802.1p frames map to vlan 1 */ if (vlan == 0) vlan = 1; /* * A route for this destination might already exist. If so, * update it, otherwise create a new one. */ if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) { if (sc->sc_brtcnt >= sc->sc_brtmax) { sc->sc_brtexceeded++; return (ENOSPC); } /* Check per interface address limits (if enabled) */ if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) { bif->bif_addrexceeded++; return (ENOSPC); } /* * Allocate a new bridge forwarding node, and * initialize the expiration time and Ethernet * address. */ brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) return (ENOMEM); if (bif->bif_flags & IFBIF_STICKY) brt->brt_flags = IFBAF_STICKY; else brt->brt_flags = IFBAF_DYNAMIC; memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN); brt->brt_vlan = vlan; if ((error = bridge_rtnode_insert(sc, brt)) != 0) { uma_zfree(bridge_rtnode_zone, brt); return (error); } brt->brt_dst = bif; bif->bif_addrcnt++; } if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && brt->brt_dst != bif) { brt->brt_dst->bif_addrcnt--; brt->brt_dst = bif; brt->brt_dst->bif_addrcnt++; } if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + sc->sc_brttimeout; if (setflags) brt->brt_flags = flags; return (0); } /* * bridge_rtlookup: * * Lookup the destination interface for an address. */ static struct ifnet * bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; BRIDGE_LOCK_ASSERT(sc); if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL) return (NULL); return (brt->brt_ifp); } /* * bridge_rttrim: * * Trim the routine table so that we have a number * of routing entries less than or equal to the * maximum number. */ static void bridge_rttrim(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); /* Make sure we actually need to do this. */ if (sc->sc_brtcnt <= sc->sc_brtmax) return; /* Force an aging cycle; this might trim enough addresses. */ bridge_rtage(sc); if (sc->sc_brtcnt <= sc->sc_brtmax) return; LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { bridge_rtnode_destroy(sc, brt); if (sc->sc_brtcnt <= sc->sc_brtmax) return; } } } /* * bridge_timer: * * Aging timer for the bridge. */ static void bridge_timer(void *arg) { struct bridge_softc *sc = arg; BRIDGE_LOCK_ASSERT(sc); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); } /* * bridge_rtage: * * Perform an aging cycle. */ static void bridge_rtage(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { if (time_uptime >= brt->brt_expire) bridge_rtnode_destroy(sc, brt); } } } /* * bridge_rtflush: * * Remove all dynamic addresses from the bridge. 
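 * With 'full' set, every entry is removed regardless of its type.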
*/ static void bridge_rtflush(struct bridge_softc *sc, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtdaddr: * * Remove an address from the table. */ static int bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; int found = 0; BRIDGE_LOCK_ASSERT(sc); /* * If vlan is zero then we want to delete for all vlans so the lookup * may return more than one. */ while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) { bridge_rtnode_destroy(sc, brt); found = 1; } return (found ? 0 : ENOENT); } /* * bridge_rtdelete: * * Delete routes to a speicifc member interface. */ static void bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (brt->brt_ifp == ifp && (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtable_init: * * Initialize the route table for this bridge. */ static void bridge_rtable_init(struct bridge_softc *sc) { int i; sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_WAITOK); for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) LIST_INIT(&sc->sc_rthash[i]); sc->sc_rthash_key = arc4random(); LIST_INIT(&sc->sc_rtlist); } /* * bridge_rtable_fini: * * Deconstruct the route table for this bridge. */ static void bridge_rtable_fini(struct bridge_softc *sc) { KASSERT(sc->sc_brtcnt == 0, ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt)); free(sc->sc_rthash, M_DEVBUF); } /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). */ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) static __inline uint32_t bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; mix(a, b, c); return (c & BRIDGE_RTHASH_MASK); } #undef mix static int bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { d = ((int)a[i]) - ((int)b[i]); } return (d); } /* * bridge_rtnode_lookup: * * Look up a bridge route node for the specified destination. Compare the * vlan id or if zero then just return the first match. */ static struct bridge_rtnode * bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; uint32_t hash; int dir; BRIDGE_LOCK_ASSERT(sc); hash = bridge_rthash(sc, addr); LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) return (brt); if (dir > 0) return (NULL); } return (NULL); } /* * bridge_rtnode_insert: * * Insert the specified bridge node into the route table. We * assume the entry is not already in the table. 
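 * The hash chain is kept sorted by address so that lookups can stop early; inserting a duplicate address/vlan pair returns EEXIST.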
*/ static int bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) { struct bridge_rtnode *lbrt; uint32_t hash; int dir; BRIDGE_LOCK_ASSERT(sc); hash = bridge_rthash(sc, brt->brt_addr); lbrt = LIST_FIRST(&sc->sc_rthash[hash]); if (lbrt == NULL) { LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); goto out; } do { dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) return (EEXIST); if (dir > 0) { LIST_INSERT_BEFORE(lbrt, brt, brt_hash); goto out; } if (LIST_NEXT(lbrt, brt_hash) == NULL) { LIST_INSERT_AFTER(lbrt, brt, brt_hash); goto out; } lbrt = LIST_NEXT(lbrt, brt_hash); } while (lbrt != NULL); #ifdef DIAGNOSTIC panic("bridge_rtnode_insert: impossible"); #endif out: LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); sc->sc_brtcnt++; return (0); } /* * bridge_rtnode_destroy: * * Destroy a bridge rtnode. */ static void bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) { BRIDGE_LOCK_ASSERT(sc); LIST_REMOVE(brt, brt_hash); LIST_REMOVE(brt, brt_list); sc->sc_brtcnt--; brt->brt_dst->bif_addrcnt--; uma_zfree(bridge_rtnode_zone, brt); } /* * bridge_rtable_expire: * * Set the expiry time for all routes on an interface. */ static void bridge_rtable_expire(struct ifnet *ifp, int age) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; BRIDGE_LOCK(sc); /* * If the age is zero then flush, otherwise set all the expiry times to * age for the interface */ if (age == 0) bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); else { LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { /* Cap the expiry time to 'age' */ if (brt->brt_ifp == ifp && brt->brt_expire > time_uptime + age && (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + age; } } BRIDGE_UNLOCK(sc); } /* * bridge_state_change: * * Callback from the bridgestp code when a port changes states. */ static void bridge_state_change(struct ifnet *ifp, int state) { struct bridge_softc *sc = ifp->if_bridge; static const char *stpstates[] = { "disabled", "listening", "learning", "forwarding", "blocking", "discarding" }; CURVNET_SET(ifp->if_vnet); if (V_log_stp) log(LOG_NOTICE, "%s: state changed to %s on %s\n", sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); CURVNET_RESTORE(); } /* * Send bridge packets through pfil if they are one of the types pfil can deal * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without * question.) If *bifp or *ifp are NULL then packet filtering is skipped for * that interface. */ static int bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) { int snap, error, i, hlen; struct ether_header *eh1, eh2; struct ip *ip; struct llc llc1; u_int16_t ether_type; snap = 0; error = -1; /* Default error if not error == 0 */ #if 0 /* we may return with the IP fields swapped, ensure its not shared */ KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); #endif if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0) return (0); /* filtering is disabled */ i = min((*mp)->m_pkthdr.len, max_protohdr); if ((*mp)->m_len < i) { *mp = m_pullup(*mp, i); if (*mp == NULL) { printf("%s: m_pullup failed\n", __func__); return (-1); } } eh1 = mtod(*mp, struct ether_header *); ether_type = ntohs(eh1->ether_type); /* * Check for SNAP/LLC. 
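 * A type field below ETHERMTU is really an 802.3 length, so the actual protocol type has to be taken from the SNAP header when one is present.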
*/ if (ether_type < ETHERMTU) { struct llc *llc2 = (struct llc *)(eh1 + 1); if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && llc2->llc_dsap == LLC_SNAP_LSAP && llc2->llc_ssap == LLC_SNAP_LSAP && llc2->llc_control == LLC_UI) { ether_type = htons(llc2->llc_un.type_snap.ether_type); snap = 1; } } /* * If we're trying to filter bridge traffic, don't look at anything * other than IP and ARP traffic. If the filter doesn't understand * IPv6, don't allow IPv6 through the bridge either. This is lame * since if we really wanted, say, an AppleTalk filter, we are hosed, * but of course we don't have an AppleTalk filter to begin with. * (Note that since pfil doesn't understand ARP it will pass *ALL* * ARP traffic.) */ switch (ether_type) { case ETHERTYPE_ARP: case ETHERTYPE_REVARP: if (V_pfil_ipfw_arp == 0) return (0); /* Automatically pass */ break; case ETHERTYPE_IP: #ifdef INET6 case ETHERTYPE_IPV6: #endif /* INET6 */ break; default: /* * Check to see if the user wants to pass non-ip * packets, these will not be checked by pfil(9) and * passed unconditionally so the default is to drop. */ if (V_pfil_onlyip) goto bad; } /* Run the packet through pfil before stripping link headers */ if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* packet consumed by filter */ return (error); } /* Strip off the Ethernet header and keep a copy. */ m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); m_adj(*mp, ETHER_HDR_LEN); /* Strip off snap header, if present */ if (snap) { m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); m_adj(*mp, sizeof(struct llc)); } /* * Check the IP header for alignment and errors */ if (dir == PFIL_IN) { switch (ether_type) { case ETHERTYPE_IP: error = bridge_ip_checkbasic(mp); break; #ifdef INET6 case ETHERTYPE_IPV6: error = bridge_ip6_checkbasic(mp); break; #endif /* INET6 */ default: error = 0; } if (error) goto bad; } error = 0; /* * Run the packet through pfil */ switch (ether_type) { case ETHERTYPE_IP: /* * Run pfil on the member interface and the bridge, both can * be skipped by clearing pfil_member or pfil_bridge. * * Keep the order: * in_if -> bridge_if -> out_if */ if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; /* check if we need to fragment the packet */ if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) { i = (*mp)->m_pkthdr.len; if (i > ifp->if_mtu) { error = bridge_fragment(ifp, *mp, &eh2, snap, &llc1); return (error); } } /* Recalculate the ip checksum. 
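 * The filters above may have changed the header, so ip_sum is recomputed here before the Ethernet header is put back on.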
*/ ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) goto bad; if (hlen > (*mp)->m_len) { if ((*mp = m_pullup(*mp, hlen)) == 0) goto bad; ip = mtod(*mp, struct ip *); if (ip == NULL) goto bad; } ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(*mp, hlen); break; #ifdef INET6 case ETHERTYPE_IPV6: if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); break; #endif default: error = 0; break; } if (*mp == NULL) return (error); if (error != 0) goto bad; error = -1; /* * Finally, put everything back the way it was and return */ if (snap) { M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT); if (*mp == NULL) return (error); bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); } M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT); if (*mp == NULL) return (error); bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); return (0); bad: m_freem(*mp); *mp = NULL; return (error); } /* * Perform basic checks on header size since * pfil assumes ip_input has already processed * it for it. Cut-and-pasted from ip_input.c. * Given how simple the IPv6 version is, * does the IPv4 version really need to be * this complicated? * * XXX Should we update ipstat here, or not? * XXX Right now we update ipstat but not * XXX csum_counter. */ static int bridge_ip_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip *ip; int len, hlen; u_short sum; if (*mp == NULL) return (-1); if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { if ((m = m_copyup(m, sizeof(struct ip), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } else if (__predict_false(m->m_len < sizeof (struct ip))) { if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; if (ip->ip_v != IPVERSION) { KMOD_IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ KMOD_IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { KMOD_IPSTAT_INC(ips_badhlen); goto bad; } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { KMOD_IPSTAT_INC(ips_badsum); goto bad; } /* Retrieve the packet length. */ len = ntohs(ip->ip_len); /* * Check for additional length bogosity */ if (len < hlen) { KMOD_IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < len) { KMOD_IPSTAT_INC(ips_tooshort); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #ifdef INET6 /* * Same as above, but for IPv6. * Cut-and-pasted from ip6_input.c. * XXX Should we update ip6stat, or not? 
*/ static int bridge_ip6_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip6_hdr *ip6; /* * If the IPv6 header is not aligned, slurp it up into a new * mbuf with space for link headers, in the event we forward * it. Otherwise, if it is aligned, make sure the entire base * IPv6 header is in the first mbuf of the chain. */ if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_copyup(m, sizeof(struct ip6_hdr), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #endif /* INET6 */ /* * bridge_fragment: * * Return a fragmented mbuf chain. */ static int bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, int snap, struct llc *llc) { struct mbuf *m0; struct ip *ip; int error = -1; if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) goto out; ip = mtod(m, struct ip *); m->m_pkthdr.csum_flags |= CSUM_IP; error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist); if (error) goto out; /* walk the chain and re-add the Ethernet header */ for (m0 = m; m0; m0 = m0->m_nextpkt) { if (error == 0) { if (snap) { M_PREPEND(m0, sizeof(struct llc), M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; } bcopy(llc, mtod(m0, caddr_t), sizeof(struct llc)); } M_PREPEND(m0, ETHER_HDR_LEN, M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; } bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); } else m_freem(m); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); return (error); out: if (m != NULL) m_freem(m); return (error); } static void bridge_linkstate(struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); return; } bridge_linkcheck(sc); BRIDGE_UNLOCK(sc); bstp_linkstate(&bif->bif_stp); } static void bridge_linkcheck(struct bridge_softc *sc) { struct bridge_iflist *bif; int new_link, hasls; BRIDGE_LOCK_ASSERT(sc); new_link = LINK_STATE_DOWN; hasls = 0; /* Our link is considered up if at least one of our ports is active */ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE) hasls++; if (bif->bif_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if (!LIST_EMPTY(&sc->sc_iflist) && !hasls) { /* If no interfaces support link-state then we default to up */ new_link = LINK_STATE_UP; } if_link_state_change(sc->sc_ifp, new_link); } Index: projects/vnet/sys/net/if_enc.c =================================================================== --- projects/vnet/sys/net/if_enc.c (revision 297511) +++ projects/vnet/sys/net/if_enc.c (revision 297512) @@ -1,420 +1,412 @@ /*- * Copyright (c) 2006 The FreeBSD Project. * Copyright (c) 2015 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #define ENCMTU (1024+512) /* XXX this define must have the same value as in OpenBSD */ #define M_CONF 0x0400 /* payload was encrypted (ESP-transport) */ #define M_AUTH 0x0800 /* payload was authenticated (AH or ESP auth) */ #define M_AUTH_AH 0x2000 /* header was authenticated (AH) */ struct enchdr { u_int32_t af; u_int32_t spi; u_int32_t flags; }; struct enc_softc { struct ifnet *sc_ifp; }; static VNET_DEFINE(struct enc_softc *, enc_sc); #define V_enc_sc VNET(enc_sc) static VNET_DEFINE(struct if_clone *, enc_cloner); #define V_enc_cloner VNET(enc_cloner) static int enc_ioctl(struct ifnet *, u_long, caddr_t); static int enc_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int enc_clone_create(struct if_clone *, int, caddr_t); static void enc_clone_destroy(struct ifnet *); static int enc_add_hhooks(struct enc_softc *); static void enc_remove_hhooks(struct enc_softc *); static const char encname[] = "enc"; /* * Before and after are relative to when we are stripping the * outer IP header. 
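 * The masks below select, per direction, whether bpf and the packet filter see the packet at the BEFORE point, the AFTER point, or at both.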
*/ static VNET_DEFINE(int, filter_mask_in) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, bpf_mask_in) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, filter_mask_out) = IPSEC_ENC_BEFORE; static VNET_DEFINE(int, bpf_mask_out) = IPSEC_ENC_BEFORE | IPSEC_ENC_AFTER; #define V_filter_mask_in VNET(filter_mask_in) #define V_bpf_mask_in VNET(bpf_mask_in) #define V_filter_mask_out VNET(filter_mask_out) #define V_bpf_mask_out VNET(bpf_mask_out) static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW, 0, "enc sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW, 0, "enc input sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW, 0, "enc output sysctl"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_in), 0, "IPsec input firewall filter mask"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_in), 0, "IPsec input bpf mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_out), 0, "IPsec output firewall filter mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_out), 0, "IPsec output bpf mask"); static void enc_clone_destroy(struct ifnet *ifp) { struct enc_softc *sc; sc = ifp->if_softc; KASSERT(sc == V_enc_sc, ("sc != ifp->if_softc")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DEVBUF); V_enc_sc = NULL; } static int enc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct enc_softc *sc; sc = malloc(sizeof(struct enc_softc), M_DEVBUF, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ENC); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } if (V_enc_sc != NULL) { if_free(ifp); free(sc, M_DEVBUF); return (EEXIST); } V_enc_sc = sc; if_initname(ifp, encname, unit); ifp->if_mtu = ENCMTU; ifp->if_ioctl = enc_ioctl; ifp->if_output = enc_output; ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_ENC, sizeof(struct enchdr)); return (0); } static int enc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { m_freem(m); return (0); } static int enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { if (cmd != SIOCSIFFLAGS) return (EINVAL); if (ifp->if_flags & IFF_UP) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return (0); } /* * One helper hook function is used by any hook points. * + from hhook_type we can determine the packet direction: * HHOOK_TYPE_IPSEC_IN or HHOOK_TYPE_IPSEC_OUT; * + from hhook_id we can determine address family: AF_INET or AF_INET6; * + udata contains pointer to enc_softc; * + ctx_data contains pointer to struct ipsec_ctx_data. */ static int enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, void *hdata, struct osd *hosd) { struct enchdr hdr; struct ipsec_ctx_data *ctx; struct enc_softc *sc; struct ifnet *ifp, *rcvif; struct pfil_head *ph; int pdir; sc = (struct enc_softc *)udata; ifp = sc->sc_ifp; if ((ifp->if_flags & IFF_UP) == 0) return (0); ctx = (struct ipsec_ctx_data *)ctx_data; /* XXX: wrong hook point was used by caller? 
*/ if (ctx->af != hhook_id) return (EPFNOSUPPORT); if (((hhook_type == HHOOK_TYPE_IPSEC_IN && (ctx->enc & V_bpf_mask_in) != 0) || (hhook_type == HHOOK_TYPE_IPSEC_OUT && (ctx->enc & V_bpf_mask_out) != 0)) && bpf_peers_present(ifp->if_bpf) != 0) { hdr.af = ctx->af; hdr.spi = ctx->sav->spi; hdr.flags = 0; if (ctx->sav->alg_enc != SADB_EALG_NONE) hdr.flags |= M_CONF; if (ctx->sav->alg_auth != SADB_AALG_NONE) hdr.flags |= M_AUTH; bpf_mtap2(ifp->if_bpf, &hdr, sizeof(hdr), *ctx->mp); } switch (hhook_type) { case HHOOK_TYPE_IPSEC_IN: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_in) == 0) return (0); /* skip pfil processing */ pdir = PFIL_IN; break; case HHOOK_TYPE_IPSEC_OUT: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_out) == 0) return (0); /* skip pfil processing */ pdir = PFIL_OUT; break; default: return (EINVAL); } switch (hhook_id) { #ifdef INET case AF_INET: ph = &V_inet_pfil_hook; break; #endif #ifdef INET6 case AF_INET6: ph = &V_inet6_pfil_hook; break; #endif default: ph = NULL; } if (ph == NULL || !PFIL_HOOKED(ph)) return (0); /* Make a packet looks like it was received on enc(4) */ rcvif = (*ctx->mp)->m_pkthdr.rcvif; (*ctx->mp)->m_pkthdr.rcvif = ifp; if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, NULL) != 0 || *ctx->mp == NULL) { *ctx->mp = NULL; /* consumed by filter */ return (EACCES); } (*ctx->mp)->m_pkthdr.rcvif = rcvif; return (0); } static int enc_add_hhooks(struct enc_softc *sc) { struct hookinfo hki; int error; error = EPFNOSUPPORT; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif return (error); } static void enc_remove_hhooks(struct enc_softc *sc) { struct hookinfo hki; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki); #endif } static void vnet_enc_init(const void *unused __unused) { V_enc_sc = NULL; V_enc_cloner = if_clone_simple(encname, enc_clone_create, enc_clone_destroy, 1); } VNET_SYSINIT(vnet_enc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_enc_init, NULL); static void vnet_enc_init_proto(void *unused __unused) { KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); if 
(enc_add_hhooks(V_enc_sc) != 0) enc_clone_destroy(V_enc_sc->sc_ifp); } VNET_SYSINIT(vnet_enc_init_proto, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_enc_init_proto, NULL); static void vnet_enc_uninit(const void *unused __unused) { - - if_clone_detach(V_enc_cloner); -} -VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, - vnet_enc_uninit, NULL); - -static void -vnet_enc_uninit_proto(void *unused __unused) -{ KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); enc_remove_hhooks(V_enc_sc); + if_clone_detach(V_enc_cloner); } -VNET_SYSUNINIT(vnet_enc_uninit_proto, SI_SUB_PROTO_IFATTACHDOMAIN, - SI_ORDER_ANY, vnet_enc_uninit_proto, NULL); +VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_PSEUDO_DONE, SI_ORDER_ANY, + vnet_enc_uninit, NULL); static int enc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t enc_mod = { "if_enc", enc_modevent, 0 }; DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: projects/vnet/sys/net/if_epair.c =================================================================== --- projects/vnet/sys/net/if_epair.c (revision 297511) +++ projects/vnet/sys/net/if_epair.c (revision 297512) @@ -1,1010 +1,1010 @@ /*- * Copyright (c) 2008 The FreeBSD Foundation * Copyright (c) 2009-2010 Bjoern A. Zeeb * All rights reserved. * * This software was developed by CK Software GmbH under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * A pair of virtual back-to-back connected ethernet like interfaces * (``two interfaces with a virtual cross-over cable''). * * This is mostly intended to be used to provide connectivity between * different virtual network stack instances. */ /* * Things to re-think once we have more experience: * - ifp->if_reassign function once we can test with vimage. Depending on * how if_vmove() is going to be improved. * - Real random etheraddrs that are checked to be uniquish; we would need * to re-do them in case we move the interface between network stacks * in a private if_reassign function. * In case we bridge to a real interface/network or between indepedent * epairs on multiple stacks/machines, we may need this. * For now let the user handle that case. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); #ifdef EPAIR_DEBUG static int epair_debug = 0; SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW, &epair_debug, 0, "if_epair(4) debugging."); #define DPRINTF(fmt, arg...) \ if (epair_debug) \ printf("[%s:%d] " fmt, __func__, __LINE__, ##arg) #else #define DPRINTF(fmt, arg...) #endif static void epair_nh_sintr(struct mbuf *); static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *); static void epair_nh_drainedcpu(u_int); static void epair_start_locked(struct ifnet *); static int epair_media_change(struct ifnet *); static void epair_media_status(struct ifnet *, struct ifmediareq *); static int epair_clone_match(struct if_clone *, const char *); static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); static int epair_clone_destroy(struct if_clone *, struct ifnet *); static const char epairname[] = "epair"; /* Netisr related definitions and sysctl. */ static struct netisr_handler epair_nh = { .nh_name = epairname, .nh_proto = NETISR_EPAIR, .nh_policy = NETISR_POLICY_CPU, .nh_handler = epair_nh_sintr, .nh_m2cpuid = epair_nh_m2cpuid, .nh_drainedcpu = epair_nh_drainedcpu, }; static int sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&epair_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&epair_nh, qlimit)); } SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_epair_netisr_maxqlen, "I", "Maximum if_epair(4) netisr \"hw\" queue length"); struct epair_softc { struct ifnet *ifp; /* This ifp. */ struct ifnet *oifp; /* other ifp of pair. */ struct ifmedia media; /* Media config (fake). */ u_int refcount; /* # of mbufs in flight. */ u_int cpuid; /* CPU ID assigned upon creation. */ void (*if_qflush)(struct ifnet *); /* Original if_qflush routine. */ }; /* * Per-CPU list of ifps with data in the ifq that needs to be flushed * to the netisr ``hw'' queue before we allow any further direct queuing * to the ``hw'' queue. */ struct epair_ifp_drain { STAILQ_ENTRY(epair_ifp_drain) ifp_next; struct ifnet *ifp; }; STAILQ_HEAD(eid_list, epair_ifp_drain); #define EPAIR_LOCK_INIT(dpcpu) mtx_init(&(dpcpu)->if_epair_mtx, \ "if_epair", NULL, MTX_DEF) #define EPAIR_LOCK_DESTROY(dpcpu) mtx_destroy(&(dpcpu)->if_epair_mtx) #define EPAIR_LOCK_ASSERT(dpcpu) mtx_assert(&(dpcpu)->if_epair_mtx, \ MA_OWNED) #define EPAIR_LOCK(dpcpu) mtx_lock(&(dpcpu)->if_epair_mtx) #define EPAIR_UNLOCK(dpcpu) mtx_unlock(&(dpcpu)->if_epair_mtx) #ifdef INVARIANTS #define EPAIR_REFCOUNT_INIT(r, v) refcount_init((r), (v)) #define EPAIR_REFCOUNT_AQUIRE(r) refcount_acquire((r)) #define EPAIR_REFCOUNT_RELEASE(r) refcount_release((r)) #define EPAIR_REFCOUNT_ASSERT(a, p) KASSERT(a, p) #else #define EPAIR_REFCOUNT_INIT(r, v) #define EPAIR_REFCOUNT_AQUIRE(r) #define EPAIR_REFCOUNT_RELEASE(r) #define EPAIR_REFCOUNT_ASSERT(a, p) #endif static MALLOC_DEFINE(M_EPAIR, epairname, "Pair of virtual cross-over connected Ethernet-like interfaces"); static VNET_DEFINE(struct if_clone *, epair_cloner); #define V_epair_cloner VNET(epair_cloner) /* * DPCPU area and functions. 
*/ struct epair_dpcpu { struct mtx if_epair_mtx; /* Per-CPU locking. */ int epair_drv_flags; /* Per-CPU ``hw'' drv flags. */ struct eid_list epair_ifp_drain_list; /* Per-CPU list of ifps with * data in the ifq. */ }; DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu); static void epair_dpcpu_init(void) { struct epair_dpcpu *epair_dpcpu; struct eid_list *s; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Initialize per-cpu lock. */ EPAIR_LOCK_INIT(epair_dpcpu); /* Driver flags are per-cpu as are our netisr "hw" queues. */ epair_dpcpu->epair_drv_flags = 0; /* * Initialize per-cpu drain list. * Manually do what STAILQ_HEAD_INITIALIZER would do. */ s = &epair_dpcpu->epair_ifp_drain_list; s->stqh_first = NULL; s->stqh_last = &s->stqh_first; } } static void epair_dpcpu_detach(void) { struct epair_dpcpu *epair_dpcpu; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Destroy per-cpu lock. */ EPAIR_LOCK_DESTROY(epair_dpcpu); } } /* * Helper functions. */ static u_int cpuid_from_ifp(struct ifnet *ifp) { struct epair_softc *sc; if (ifp == NULL) return (0); sc = ifp->if_softc; return (sc->cpuid); } /* * Netisr handler functions. */ static void epair_nh_sintr(struct mbuf *m) { struct ifnet *ifp; struct epair_softc *sc; ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount); } static struct mbuf * epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) { *cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif); return (m); } static void epair_nh_drainedcpu(u_int cpuid) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; struct ifnet *ifp; epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); /* * Assume our "hw" queue and possibly ifq will be emptied * again. In case we will overflow the "hw" queue while * draining, epair_start_locked will set IFF_DRV_OACTIVE * again and we will stop and return. */ STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { ifp = elm->ifp; epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; epair_start_locked(ifp); IFQ_LOCK(&ifp->if_snd); if (IFQ_IS_EMPTY(&ifp->if_snd)) { struct epair_softc *sc; STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. */ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } IFQ_UNLOCK(&ifp->if_snd); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { /* Our "hw"q overflew again. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; DPRINTF("hw queue length overflow at %u\n", epair_nh.nh_qlimit); break; } } EPAIR_UNLOCK(epair_dpcpu); } /* * Network interface (`if') related functions. */ static void epair_remove_ifp_from_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_ifp_drain *elm, *tvar; u_int cpuid; CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next, tvar) { if (ifp == elm->ifp) { struct epair_softc *sc; STAILQ_REMOVE( &epair_dpcpu->epair_ifp_drain_list, elm, epair_ifp_drain, ifp_next); /* The cached ifp goes off the list. 
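epair_dpcpu_init() and epair_dpcpu_detach() above follow the standard DPCPU pattern: declare one instance of the state per CPU and walk all CPUs to set it up and tear it down. A minimal sketch of that pattern (foo_pcpu is a placeholder name; the per-CPU drain list and error handling are omitted):

struct foo_pcpu {
	struct mtx	foo_mtx;
	int		foo_flags;
};
DPCPU_DEFINE(struct foo_pcpu, foo_pcpu);

static void
foo_pcpu_init(void)
{
	struct foo_pcpu *p;
	u_int cpuid;

	CPU_FOREACH(cpuid) {
		p = DPCPU_ID_PTR(cpuid, foo_pcpu);
		/* One lock and one set of "hw" driver flags per CPU. */
		mtx_init(&p->foo_mtx, "foo_pcpu", NULL, MTX_DEF);
		p->foo_flags = 0;
	}
}

static void
foo_pcpu_detach(void)
{
	u_int cpuid;

	CPU_FOREACH(cpuid)
		mtx_destroy(&DPCPU_ID_PTR(cpuid, foo_pcpu)->foo_mtx);
}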
*/ sc = ifp->if_softc; EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, ifp, sc->refcount)); free(elm, M_EPAIR); } } EPAIR_UNLOCK(epair_dpcpu); } } static int epair_add_ifp_for_draining(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct epair_ifp_drain *elm = NULL; sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next) if (elm->ifp == ifp) break; /* If the ifp is there already, return success. */ if (elm != NULL) return (0); elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO); if (elm == NULL) return (ENOMEM); elm->ifp = ifp; /* Add a reference for the ifp pointer on the list. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next); return (0); } static void epair_start_locked(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; struct mbuf *m; struct epair_softc *sc; struct ifnet *oifp; int error; DPRINTF("ifp=%p\n", ifp); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if ((ifp->if_flags & IFF_UP) == 0) return; /* * We get patckets here from ether_output via if_handoff() * and ned to put them into the input queue of the oifp * and call oifp->if_input() via netisr/epair_sintr(). */ oifp = sc->oifp; sc = oifp->if_softc; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); continue; } DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; (void) epair_add_ifp_for_draining(ifp); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } } } static void epair_start(struct ifnet *ifp) { struct epair_dpcpu *epair_dpcpu; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); epair_start_locked(ifp); EPAIR_UNLOCK(epair_dpcpu); } static int epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; struct epair_softc *sc; struct ifnet *oifp; int error, len; short mflags; DPRINTF("ifp=%p m=%p\n", ifp, m); sc = ifp->if_softc; epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); EPAIR_LOCK_ASSERT(epair_dpcpu); if (m == NULL) return (0); /* * We are not going to use the interface en/dequeue mechanism * on the TX side. We are called from ether_output_frame() * and will put the packet into the incoming queue of the * other interface of our pair via the netsir. 
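The hand-off in epair_start_locked() above is the core of the driver: the packet never leaves the machine, it is re-labelled with the peer interface as rcvif and pushed through the netisr so that input processing runs in the peer's context. The central step, condensed (schematic only; the overflow and accounting paths are trimmed, and sc here is the peer's softc, i.e. oifp->if_softc):

	/* Keep the peer softc alive while the mbuf is in flight. */
	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
	m->m_pkthdr.rcvif = oifp;
	/* The peer may live in a different vnet; switch before queueing. */
	CURVNET_SET_QUIET(oifp->if_vnet);
	error = netisr_queue(NETISR_EPAIR, m);
	CURVNET_RESTORE();
	if (error != 0) {
		/* netisr already freed the mbuf; drop our reference again. */
		EPAIR_REFCOUNT_RELEASE(&sc->refcount);
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
	}

Later, epair_nh_sintr() calls (*ifp->if_input)(ifp, m) on the receiving side and releases that reference.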
*/ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { m_freem(m); return (ENXIO); } if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return (ENETDOWN); } BPF_MTAP(ifp, m); /* * In case the outgoing interface is not usable, * drop the packet. */ oifp = sc->oifp; if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (0); } len = m->m_pkthdr.len; mflags = m->m_flags; DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); #ifdef ALTQ /* Support ALTQ via the clasic if_start() path. */ IF_LOCK(&ifp->if_snd); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); if (error) if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); IF_UNLOCK(&ifp->if_snd); if (!error) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) epair_start_locked(ifp); else (void)epair_add_ifp_for_draining(ifp); } return (error); } IF_UNLOCK(&ifp->if_snd); #endif if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) { /* * Our hardware queue is full, try to fall back * queuing to the ifq but do not call ifp->if_start. * Either we are lucky or the packet is gone. */ IFQ_ENQUEUE(&ifp->if_snd, m, error); if (!error) (void)epair_add_ifp_for_draining(ifp); return (error); } sc = oifp->if_softc; /* * Add a reference so the interface cannot go while the * packet is in transit as we rely on rcvif to stay valid. */ EPAIR_REFCOUNT_AQUIRE(&sc->refcount); m->m_pkthdr.rcvif = oifp; CURVNET_SET_QUIET(oifp->if_vnet); error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, * but as we bypass all this we have to duplicate * the logic another time. */ if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. */ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", __func__, oifp, sc->refcount)); } return (error); } static int epair_transmit(struct ifnet *ifp, struct mbuf *m) { struct epair_dpcpu *epair_dpcpu; int error; epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); EPAIR_LOCK(epair_dpcpu); error = epair_transmit_locked(ifp, m); EPAIR_UNLOCK(epair_dpcpu); return (error); } static void epair_qflush(struct ifnet *ifp) { struct epair_softc *sc; sc = ifp->if_softc; KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n", __func__, ifp, sc)); /* * Remove this ifp from all backpointer lists. The interface will not * usable for flushing anyway nor should it have anything to flush * after if_qflush(). */ epair_remove_ifp_from_draining(ifp); if (sc->if_qflush) sc->if_qflush(ifp); } static int epair_media_change(struct ifnet *ifp __unused) { /* Do nothing. 
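Two fallback paths in epair_transmit_locked() above deal with congestion. With ALTQ enabled the packet takes the classic if_start() path through the interface queue; otherwise, once the per-CPU netisr "hw" queue has overflowed (IFF_DRV_OACTIVE set in epair_drv_flags), new packets are parked in the ifq and the interface is remembered on the per-CPU drain list. When the netisr drains, epair_nh_drainedcpu() clears IFF_DRV_OACTIVE and restarts each parked interface. Roughly:

	if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
		/* "hw" queue overflowed earlier: fall back to the ifq ... */
		IFQ_ENQUEUE(&ifp->if_snd, m, error);
		if (!error)
			/* ... and make sure drainedcpu will pick us up. */
			(void)epair_add_ifp_for_draining(ifp);
		return (error);
	}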
*/ return (0); } static void epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr) { imr->ifm_status = IFM_AVALID | IFM_ACTIVE; imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct epair_softc *sc; struct ifreq *ifr; int error; ifr = (struct ifreq *)data; switch (cmd) { case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: sc = ifp->if_softc; error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); break; case SIOCSIFMTU: /* We basically allow all kinds of MTUs. */ ifp->if_mtu = ifr->ifr_mtu; error = 0; break; default: /* Let the common ethernet handler process this. */ error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void epair_init(void *dummy __unused) { } /* * Interface cloning functions. * We use our private ones so that we can create/destroy our secondary * device along with the primary one. */ static int epair_clone_match(struct if_clone *ifc, const char *name) { const char *cp; DPRINTF("name='%s'\n", name); /* * Our base name is epair. * Our interfaces will be named epair[ab]. * So accept anything of the following list: * - epair * - epair * but not the epair[ab] versions. */ if (strncmp(epairname, name, sizeof(epairname)-1) != 0) return (0); for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct epair_softc *sca, *scb; struct ifnet *ifp; char *dp; int error, unit, wildcard; uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * We are abusing params to create our second interface. * Actually we already created it and called if_clone_create() * for it to do the official insertion procedure the moment we knew * it cannot fail anymore. So just do attach it here. */ if (params) { scb = (struct epair_softc *)params; ifp = scb->ifp; /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0b; ether_ifattach(ifp, eaddr); /* Correctly set the name for the cloner list. */ strlcpy(name, scb->ifp->if_xname, len); return (0); } /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* * If no unit had been given, we need to adjust the ifName. * Also make sure there is space for our extra [ab] suffix. */ for (dp = name; *dp != '\0'; dp++); if (wildcard) { error = snprintf(dp, len - (dp - name), "%d", unit); if (error > len - (dp - name) - 1) { /* ifName too long. */ ifc_free_unit(ifc, unit); return (ENOSPC); } dp += error; } if (len - (dp - name) - 1 < 1) { /* No space left for our [ab] suffix. */ ifc_free_unit(ifc, unit); return (ENOSPC); } *dp = 'b'; /* Must not change dp so we can replace 'a' by 'b' later. */ *(dp+1) = '\0'; /* Check if 'a' and 'b' interfaces already exist. 
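epair_clone_match() above accepts the bare name "epair" or "epair<unit>", but rejects the final "epair<unit>a"/"epair<unit>b" names, which only the cloner itself assigns. In practice a single create request yields both ends of the pair, e.g. "ifconfig epair create" produces epair0a and epair0b. The match rule, reduced to a standalone helper (sketch only; foo_name_ok is a made-up name):

static int
foo_name_ok(const char *name)
{
	const char *cp;

	/* Must start with the base name ... */
	if (strncmp(name, "epair", 5) != 0)
		return (0);
	/* ... and anything after it must be digits only (no [ab] suffix). */
	for (cp = name + 5; *cp != '\0'; cp++)
		if (*cp < '0' || *cp > '9')
			return (0);
	return (1);
}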
*/ if (ifunit(name) != NULL) return (EEXIST); *dp = 'a'; if (ifunit(name) != NULL) return (EEXIST); /* Allocate memory for both [ab] interfaces */ sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&sca->refcount, 1); sca->ifp = if_alloc(IFT_ETHER); if (sca->ifp == NULL) { free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&scb->refcount, 1); scb->ifp = if_alloc(IFT_ETHER); if (scb->ifp == NULL) { free(scb, M_EPAIR); if_free(sca->ifp); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (ENOSPC); } /* * Cross-reference the interfaces so we will be able to free both. */ sca->oifp = scb->ifp; scb->oifp = sca->ifp; /* * Calculate the cpuid for netisr queueing based on the * ifIndex of the interfaces. As long as we cannot configure * this or use cpuset information easily we cannot guarantee * cache locality but we can at least allow parallelism. */ sca->cpuid = netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount()); scb->cpuid = netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount()); /* Initialise pseudo media types. */ ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); /* Finish initialization of interface a. */ ifp = sca->ifp; ifp->if_softc = sca; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; ifp->if_snd.ifq_maxlen = ifqmaxlen; /* Assign a hopefully unique, locally administered etheraddr. */ eaddr[0] = 0x02; eaddr[3] = (ifp->if_index >> 8) & 0xff; eaddr[4] = ifp->if_index & 0xff; eaddr[5] = 0x0a; ether_ifattach(ifp, eaddr); sca->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* Swap the name and finish initialization of interface b. */ *dp = 'b'; ifp = scb->ifp; ifp->if_softc = scb; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; ifp->if_snd.ifq_maxlen = ifqmaxlen; /* We need to play some tricks here for the second interface. */ strlcpy(name, epairname, len); error = if_clone_create(name, len, (caddr_t)scb); if (error) panic("%s: if_clone_create() for our 2nd iface failed: %d", __func__, error); scb->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* * Restore name to a as the ifp for this will go into the * cloner list for the initial call. */ strlcpy(name, sca->ifp->if_xname, len); DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); /* Tell the world, that we are ready to rock. 
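The creation of the second interface above relies on a small trick: interface "b" is fully allocated and initialized by the first call, then registered with the cloner framework by calling if_clone_create() again with the softc passed as params, which re-enters epair_clone_create() and takes the short params != NULL branch at the top. Condensed from the code above (schematic):

	/* First (user-initiated) call: build both softcs, then register 'b'
	 * through the cloner framework by handing it the prepared softc. */
	strlcpy(name, epairname, len);
	error = if_clone_create(name, len, (caddr_t)scb);

	/* ... which re-enters epair_clone_create(); that call only attaches
	 * the already-built interface and reports its name back. */
	if (params) {
		scb = (struct epair_softc *)params;
		ether_ifattach(scb->ifp, eaddr);
		strlcpy(name, scb->ifp->if_xname, len);
		return (0);
	}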
*/ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(sca->ifp, LINK_STATE_UP); if_link_state_change(scb->ifp, LINK_STATE_UP); return (0); } static int epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifnet *oifp; struct epair_softc *sca, *scb; int unit, error; DPRINTF("ifp=%p\n", ifp); /* * In case we called into if_clone_destroyif() ourselves * again to remove the second interface, the softc will be * NULL. In that case so not do anything but return success. */ if (ifp->if_softc == NULL) return (0); unit = ifp->if_dunit; sca = ifp->if_softc; oifp = sca->oifp; scb = oifp->if_softc; DPRINTF("ifp=%p oifp=%p\n", ifp, oifp); if_link_state_change(ifp, LINK_STATE_DOWN); if_link_state_change(oifp, LINK_STATE_DOWN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; oifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Get rid of our second half. As the other of the two * interfaces may reside in a different vnet, we need to * switch before freeing them. */ CURVNET_SET_QUIET(oifp->if_vnet); ether_ifdetach(oifp); /* * Wait for all packets to be dispatched to if_input. * The numbers can only go down as the interface is * detached so there is no need to use atomics. */ DPRINTF("scb refcnt=%u\n", scb->refcount); EPAIR_REFCOUNT_ASSERT(scb->refcount == 1, ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount)); oifp->if_softc = NULL; error = if_clone_destroyif(ifc, oifp); if (error) panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", __func__, error); if_free(oifp); ifmedia_removeall(&scb->media); free(scb, M_EPAIR); CURVNET_RESTORE(); ether_ifdetach(ifp); /* * Wait for all packets to be dispatched to if_input. */ DPRINTF("sca refcnt=%u\n", sca->refcount); EPAIR_REFCOUNT_ASSERT(sca->refcount == 1, ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount)); if_free(ifp); ifmedia_removeall(&sca->media); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); return (0); } static void vnet_epair_init(const void *unused __unused) { V_epair_cloner = if_clone_advanced(epairname, 0, epair_clone_match, epair_clone_create, epair_clone_destroy); } -VNET_SYSINIT(vnet_epair_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_epair_init, NULL); static void vnet_epair_uninit(const void *unused __unused) { if_clone_detach(V_epair_cloner); } -VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_PSEUDO_DONE, SI_ORDER_ANY, vnet_epair_uninit, NULL); static int epair_modevent(module_t mod, int type, void *data) { int qlimit; switch (type) { case MOD_LOAD: /* For now limit us to one global mutex and one inq. */ epair_dpcpu_init(); epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. 
*/ if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit)) epair_nh.nh_qlimit = qlimit; netisr_register(&epair_nh); if (bootverbose) printf("%s initialized.\n", epairname); break; case MOD_UNLOAD: netisr_unregister(&epair_nh); epair_dpcpu_detach(); if (bootverbose) printf("%s unloaded.\n", epairname); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t epair_mod = { "if_epair", epair_modevent, 0 }; DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_epair, 1); Index: projects/vnet/sys/net/if_lagg.c =================================================================== --- projects/vnet/sys/net/if_lagg.c (revision 297511) +++ projects/vnet/sys/net/if_lagg.c (revision 297512) @@ -1,2240 +1,2240 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include /* Special flags we should propagate to the lagg ports. 
*/ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) static VNET_DEFINE(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static void lagg_lladdr(struct lagg_softc *, uint8_t *); static void lagg_capabilities(struct lagg_softc *); static void lagg_port_lladdr(struct lagg_port *, uint8_t *, lagg_llqtype); static void lagg_port_setlladdr(void *, int); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_ether_setmulti(struct lagg_softc *); static int lagg_ether_cmdmulti(struct lagg_port *, int); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc 
*, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ static VNET_DEFINE(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ static VNET_DEFINE(int, def_use_flowid) = 1; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for flowid shift */ static VNET_DEFINE(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case 
MOD_LOAD: lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_WLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); else LAGG_WUNLOCK(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, 
lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); LAGG_LOCK_INIT(sc); SLIST_INIT(&sc->sc_ports); TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_WLOCK(sc); lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(sc); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); LAGG_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } /* * Set link-layer address on the lagg interface itself. * * Set noinline to be dtrace-friendly */ static __noinline void lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port lp; if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; LAGG_WLOCK_ASSERT(sc); /* * Set the link layer address on the lagg interface. * lagg_proto_lladdr() notifies the MAC change to * the aggregation protocol. iflladdr_event handler which * may trigger gratuitous ARPs for INET will be handled in * a taskqueue. */ bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); /* * Send notification request for lagg interface * itself. Note that new lladdr is already set. 
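All protocol-specific behaviour in lagg(4) funnels through the lagg_protos[] table and the lagg_proto_*() wrappers above: mandatory hooks such as pr_start and pr_input are called directly, while optional ones are NULL-checked so that simple protocols (e.g. failover) only fill in what they need. The shape of that dispatch, as a sketch (foo_* names are placeholders; foo_protos[] stands for a table declared like lagg_protos[] above):

struct foo_proto_ops {
	int	pr_num;
	int	(*pr_start)(struct foo_softc *, struct mbuf *);	/* required */
	int	(*pr_addport)(struct foo_port *);		/* optional */
};

static int
foo_proto_addport(struct foo_softc *sc, struct foo_port *p)
{

	/* A missing optional hook simply means "nothing to do". */
	if (foo_protos[sc->sc_proto].pr_addport == NULL)
		return (0);
	return (foo_protos[sc->sc_proto].pr_addport(p));
}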
*/ bzero(&lp, sizeof(lp)); lp.lp_ifp = sc->sc_ifp; lp.lp_softc = sc; /* Do not request lladdr change */ lagg_port_lladdr(&lp, lladdr, LAGG_LLQTYPE_VIRT); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap = ~0, ena = ~0; u_long hwa = ~0UL; struct ifnet_hw_tsomax hw_tsomax; LAGG_WLOCK_ASSERT(sc); memset(&hw_tsomax, 0, sizeof(hw_tsomax)); /* Get capabilities from the lagg ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; ena &= lp->lp_ifp->if_capenable; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); ena = (ena == ~0 ? 0 : ena); hwa = (hwa == ~0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } /* * Enqueue interface lladdr notification. * If request is already queued, it is updated. * If setting lladdr is also desired, @do_change has to be set to 1. * * Set noinline to be dtrace-friendly */ static __noinline void lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr, lagg_llqtype llq_type) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct lagg_llq *llq; LAGG_WLOCK_ASSERT(sc); /* * Do not enqueue requests where lladdr is the same for * "physical" interfaces (e.g. ports in lagg) */ if (llq_type == LAGG_LLQTYPE_PHYS && memcmp(IF_LLADDR(ifp), lladdr, ETHER_ADDR_LEN) == 0) return; /* Check to make sure its not already queued to be changed */ SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { /* Update lladdr, it may have changed */ bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); return; } } llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT | M_ZERO); if (llq == NULL) /* XXX what to do */ return; llq->llq_ifp = ifp; llq->llq_type = llq_type; bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); /* XXX: We should insert to tail */ SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); } /* * Set the interface MAC address from a taskqueue to avoid a LOR. * * Set noinline to be dtrace-friendly */ static __noinline void lagg_port_setlladdr(void *arg, int pending) { struct lagg_softc *sc = (struct lagg_softc *)arg; struct lagg_llq *llq, *head; struct ifnet *ifp; /* Grab a local reference of the queue and remove it from the softc */ LAGG_WLOCK(sc); head = SLIST_FIRST(&sc->sc_llq_head); SLIST_FIRST(&sc->sc_llq_head) = NULL; LAGG_WUNLOCK(sc); /* * Traverse the queue and set the lladdr on each ifp. It is safe to do * unlocked as we have the only reference to it. */ for (llq = head; llq != NULL; llq = head) { ifp = llq->llq_ifp; CURVNET_SET(ifp->if_vnet); /* * Set the link layer address on the laggport interface. * Note that if_setlladdr() or iflladdr_event handler * may result in arp transmission / lltable updates. 
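lagg_port_lladdr() and lagg_port_setlladdr() above defer the actual MAC change to taskqueue_swi, so the if_setlladdr() call (which may trigger ARP transmission and lltable updates) never runs with the lagg write lock held; per the comment above this avoids a lock-order reversal. The deferral pattern in isolation (sketch; foo_setlladdr_task is a placeholder for the task handler, the field names are those used above):

	/* At attach time: bind the task to its handler. */
	TASK_INIT(&sc->sc_lladdr_task, 0, foo_setlladdr_task, sc);

	/* Under the softc write lock: queue a request and kick the task. */
	SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries);
	taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task);

	/*
	 * In the task handler: take the whole request list under the lock,
	 * drop the lock, then walk the private copy and apply each change.
	 */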
*/ if (llq->llq_type == LAGG_LLQTYPE_PHYS) if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); else EVENTHANDLER_INVOKE(iflladdr_event, ifp); CURVNET_RESTORE(); head = SLIST_NEXT(llq, llq_entries); free(llq, M_DEVBUF); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; int error, i; uint64_t *pval; LAGG_WLOCK_ASSERT(sc); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (EINVAL); } if ((lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) return (ENOMEM); /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; lp->lp_ifp = ifp; lp->lp_softc = sc; /* Save port link layer address */ bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); if (SLIST_EMPTY(&sc->sc_ports)) { sc->sc_primary = lp; /* First port in lagg. Update/notify lagg lladdress */ lagg_lladdr(sc, IF_LLADDR(ifp)); } else { /* * Update link layer address for this port and * send notifications to other subsystems. */ lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp), LAGG_LLQTYPE_PHYS); } /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( SLIST_NEXT(tlp, lp_entries) == NULL || SLIST_NEXT(tlp, lp_entries)->lp_ifp->if_index > ifp->if_index)) break; } if (tlp != NULL) SLIST_INSERT_AFTER(tlp, lp, lp_entries); else SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* Add multicast addresses and interface flags to this port */ lagg_ether_cmdmulti(lp, 1); lagg_setflags(lp, 1); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. 
*/ lagg_port_destroy(lp, 0); return (error); } return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct lagg_llq *llq; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_WLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); /* * Remove multicast addresses and interface flags from this port and * reset the MAC address, skip if the interface is being detached. */ if (!lp->lp_detaching) { lagg_ether_cmdmulti(lp, 0); lagg_setflags(lp, 0); lagg_port_lladdr(lp, lp->lp_lladdr, LAGG_LLQTYPE_PHYS); } /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp0 = SLIST_FIRST(&sc->sc_ports)) == NULL) { bzero(&lladdr, ETHER_ADDR_LEN); } else { bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); } lagg_lladdr(sc, lladdr); /* Mark lp0 as new primary */ sc->sc_primary = lp0; /* * Enqueue lladdr update/notification for each port * (new primary needs update as well, to switch from * old lladdr to its 'real' one). 
*/ SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp_ptr, lladdr, LAGG_LLQTYPE_PHYS); } /* Remove any pending lladdr changes from the queue */ if (lp->lp_detaching) { SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq, llq_entries); free(llq, M_DEVBUF); break; /* Only appears once */ } } } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); free(lp, M_DEVBUF); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; struct rm_priotracker tracker; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_WLOCK(sc); lagg_capabilities(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the different between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; struct rm_priotracker tracker; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; LAGG_RLOCK(sc, &tracker); vsum = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; LAGG_RUNLOCK(sc, &tracker); return (vsum); } /* * For direct output to child ports. 
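A short worked example of the accounting in lagg_get_counter() above: suppose port A joined when its input-packet counter read 1000 and now reads 1250 (delta 250), port B joined at 0 and now reads 40 (delta 40), an earlier member left after contributing 60 packets (added to detached_counters at detach time), and the logical lagg interface itself added none. The reported value is then 250 + 40 + 60 + 0 = 350: per-port deltas since attach, plus detached-port history, plus whatever the logical interface accumulated on top.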
*/ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_WLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; LAGG_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be if_setlladdr() notification * that lladdr has been changed. 
*/ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp, IF_LLADDR(ifp), LAGG_LLQTYPE_PHYS); lagg_proto_init(sc); LAGG_WUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_WLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; struct rm_priotracker tracker; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_RLOCK(sc, &tracker); count = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) count++; buflen = count * sizeof(struct lagg_reqport); LAGG_RUNLOCK(sc, &tracker); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); LAGG_RLOCK(sc, &tracker); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_RUNLOCK(sc, &tracker); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_WLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(sc); lagg_proto_attach(sc, ra->ra_proto); break; case SIOCGLAGGOPTS: ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_bkt; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; break; case SIOCSLAGGOPTS: if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { if (ro->ro_bkt == 0) sc->sc_bkt = 1; // Minimum 1 packet per iface. else sc->sc_bkt = ro->ro_bkt; } error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ro->ro_opts == 0) break; /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TIMEOUT: case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: valid = lacp = 0; break; } LAGG_WLOCK(sc); if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. 
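The SIOCGLAGG handler above uses the usual two-pass copyout pattern: count the ports under the read lock to size the buffer, drop the lock before the sleeping malloc(M_WAITOK), retake the lock to snapshot each port, and only then copyout() to userland. Schematically (condensed from the code above):

	LAGG_RLOCK(sc, &tracker);
	/* pass 1: count ports, compute buflen */
	LAGG_RUNLOCK(sc, &tracker);
	outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);	/* may sleep */
	LAGG_RLOCK(sc, &tracker);
	/* pass 2: lagg_port2req() each port into outbuf */
	LAGG_RUNLOCK(sc, &tracker);
	error = copyout(outbuf, ra->ra_port, ra->ra_size);
	free(outbuf, M_TEMP);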
*/ error = EINVAL; LAGG_WUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT and LACP options. */ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_WUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_RLOCK(sc, &tracker); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_WLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_WUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. 
*/ if (in6ifa_llaonifp(tpif)) { - in6_ifdetach(tpif); + in6_ifdetach(tpif, 1); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif LAGG_WLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_WUNLOCK(sc); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_WLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_WUNLOCK(sc); break; } error = lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_WLOCK(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } LAGG_WUNLOCK(sc); if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ LAGG_WLOCK(sc); lagg_stop(sc); LAGG_WUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ (*ifp->if_init)(sc); } break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_WLOCK(sc); error = lagg_ether_setmulti(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: case SIOCSIFMTU: /* Do not allow the MTU or caps to be directly changed */ error = EINVAL; break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static int lagg_ether_setmulti(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* First, remove any existing filter entries. */ lagg_ether_cmdmulti(lp, 0); /* copy all addresses from the lagg interface to the port */ lagg_ether_cmdmulti(lp, 1); } return (0); } static int lagg_ether_cmdmulti(struct lagg_port *lp, int set) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; LAGG_WLOCK_ASSERT(sc); if (set) { IF_ADDR_WLOCK(scifp); TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } } else { while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && !lp->lp_detaching) if_delmulti_ifma(mc->mc_ifma); free(mc, M_DEVBUF); } } return (0); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_WLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. 
If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error, len, mcast; struct rm_priotracker tracker; len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; LAGG_RLOCK(sc, &tracker); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(sc, &tracker); if (error != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct rm_priotracker tracker; LAGG_RLOCK(sc, &tracker); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); if (lp->lp_detaching != 0) { m_freem(m); m = NULL; } else m = lagg_proto_input(sc, lp, m); if (m != NULL) { if (scifp->if_flags & IFF_MONITOR) { m_freem(m); m = NULL; } } LAGG_RUNLOCK(sc, &tracker); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(sc, &tracker); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(sc, &tracker); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; /* Our link is considered up if at least one of our ports is active */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? 
sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_WLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_WUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; // int new_link = LINK_STATE_DOWN; LAGG_RLOCK_ASSERT(sc); /* * Search a port which reports an active link state. */ if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } } found: if (rval != NULL) { /* * The IEEE 802.1D standard assumes that a lagg with * multiple ports is always full duplex. This is valid * for load sharing laggs and if at least two links * are active. Unfortunately, checking the latter would * be too expensive at this point. XXX if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) && (sc->sc_count > 1)) new_link = LINK_STATE_FULL_DUPLEX; else new_link = rval->lp_link_state; */ } return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; sc->sc_seq = 0; sc->sc_bkt_count = sc->sc_bkt; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) sc->sc_bkt_count = sc->sc_bkt; if (sc->sc_bkt > 0) { atomic_subtract_int(&sc->sc_bkt_count, 1); if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) p = atomic_fetchadd_32(&sc->sc_seq, 1); else p = sc->sc_seq; } else p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = SLIST_FIRST(&sc->sc_ports); while (p--) lp = SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. 
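 * For example, with sc_count == 3 the modulo walk over sc_ports above
 * selects ports 0, 1, 2, 0, ... in turn; when a bucket size (sc_bkt) is
 * configured, sc_seq is only advanced once sc_bkt_count drains to zero
 * (and is then refilled to sc_bkt), so transmits stay on one port for a
 * burst before rotating.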
*/ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've recieved a packet when all * our links are down. Weird, but process it anyways. 
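 * In other words, while the primary link is down only the port that
 * lagg_link_active() currently reports as active is allowed to receive
 * (or any port, if no link is active at all); packets arriving on the
 * remaining ports are freed below.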
*/ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; LAGG_WUNLOCK(sc); if (lb != NULL) free(lb, M_DEVBUF); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } return (0); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. 
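 * The index p above is derived either from the mbuf's flowid (shifted
 * right by flowid_shift when LAGG_OPT_USE_FLOWID is enabled) or from the
 * software L2/L3/L4 header hash selected via SIOCSLAGGHASH, reduced
 * modulo sc_count into the lb_ports table.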
*/ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; LAGG_WUNLOCK(sc); lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; /* purge all the lacp ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: projects/vnet/sys/net/if_loop.c =================================================================== --- projects/vnet/sys/net/if_loop.c (revision 297511) +++ projects/vnet/sys/net/if_loop.c (revision 297512) @@ -1,446 +1,446 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #include #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif #define LO_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define LO_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) #define LO_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ CSUM_PSEUDO_HDR | \ CSUM_IP_CHECKED | CSUM_IP_VALID | \ CSUM_SCTP_VALID) int loioctl(struct ifnet *, u_long, caddr_t); int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); VNET_DEFINE(struct ifnet *, loif); /* Used externally */ #ifdef VIMAGE static VNET_DEFINE(struct if_clone *, lo_cloner); #define V_lo_cloner VNET(lo_cloner) #endif static struct if_clone *lo_cloner; static const char loname[] = "lo"; static void lo_clone_destroy(struct ifnet *ifp) { #ifndef VIMAGE /* XXX: destroying lo0 will lead to panics. 
*/ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); #endif bpfdetach(ifp); if_detach(ifp); if_free(ifp); } static int lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if (ifp == NULL) return (ENOSPC); if_initname(ifp, loname, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_capabilities = ifp->if_capenable = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; ifp->if_hwassist = LO_CSUM_FEATURES | LO_CSUM_FEATURES6; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); if (V_loif == NULL) V_loif = ifp; return (0); } static void vnet_loif_init(const void *unused __unused) { #ifdef VIMAGE lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, 1); V_lo_cloner = lo_cloner; #else lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, 1); #endif } -VNET_SYSINIT(vnet_loif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSINIT(vnet_loif_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_loif_init, NULL); #ifdef VIMAGE static void vnet_loif_uninit(const void *unused __unused) { if_clone_detach(V_lo_cloner); V_loif = NULL; } -VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_PSEUDO_DONE, SI_ORDER_SECOND, vnet_loif_uninit, NULL); #endif static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return (EINVAL); default: return (EOPNOTSUPP); } return (0); } static moduledata_t loop_mod = { "if_lo", loop_modevent, 0 }; DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; #ifdef MAC int error; #endif M_ASSERTPKTHDR(m); /* check if we have the packet header */ #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif if (ro != NULL && ro->ro_flags & (RT_REJECT|RT_BLACKHOLE)) { m_freem(m); return (ro->ro_flags & RT_BLACKHOLE ? 0 : EHOSTUNREACH); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; #if 1 /* XXX */ switch (af) { case AF_INET: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES; break; case AF_INET6: #if 0 /* * XXX-BZ for now always claim the checksum is good despite * any interface flags. This is a workaround for 9.1-R and * a proper solution ought to be sought later. */ if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } #else m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; #endif m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES6; break; default: printf("looutput: af=%d unexpected\n", af); m_freem(m); return (EAFNOSUPPORT); } #endif return (if_simloop(ifp, m, af, 0)); } /* * if_simloop() * * This function is to support software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. 
Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. */ int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) { int isr; M_ASSERTPKTHDR(m); m_tag_delete_nonpersistent(m); m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Let BPF see incoming packet in the following manner: * - Emulated packet loopback for a simplex interface * (net/if_ethersubr.c) * -> passes it to ifp's BPF * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) * -> not passes it to any BPF * - Normal packet loopback from myself to myself (net/if_loop.c) * -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0) */ if (hlen > 0) { if (bpf_peers_present(ifp->if_bpf)) { bpf_mtap(ifp->if_bpf, m); } } else { if (bpf_peers_present(V_loif->if_bpf)) { if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. */ bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #ifndef __NO_STRICT_ALIGNMENT /* * Some archs do not like unaligned data, so * we move data down in the first mbuf. */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); m->m_data -= (mtod(m,vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; isr = NETISR_IPV6; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); netisr_queue(isr, m); /* mbuf is free'd on failure. */ return (0); } /* * Process an ioctl request. */ /* ARGSUSED */ int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0, mask; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Everything else is done at a higher level. 
*/ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: break; case SIOCSIFCAP: mask = ifp->if_capenable ^ ifr->ifr_reqcap; if ((mask & IFCAP_RXCSUM) != 0) ifp->if_capenable ^= IFCAP_RXCSUM; if ((mask & IFCAP_TXCSUM) != 0) ifp->if_capenable ^= IFCAP_TXCSUM; if ((mask & IFCAP_RXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } if ((mask & IFCAP_TXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist = LO_CSUM_FEATURES; #if 0 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= LO_CSUM_FEATURES6; #endif break; default: error = EINVAL; } return (error); } Index: projects/vnet/sys/net/route.c =================================================================== --- projects/vnet/sys/net/route.c (revision 297511) +++ projects/vnet/sys/net/route.c (revision 297512) @@ -1,2293 +1,2302 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. 
* ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. 
*/ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. 
*/ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ rw_init(&rh->rib_lock, "rib head lock"); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ rw_destroy(&rh->rib_lock); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = 
uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ RIB_RLOCK(rh); rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); RIB_RUNLOCK(rh); return (newrt); } else RIB_RUNLOCK(rh); /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. 
*/ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... 
*/ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, 0, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; else ifa_free(oifa); } return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. 
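 * Returns ENOMEM if a destination buffer supplied with NHR_COPY is too
 * small (sa_len) for the corresponding sockaddr being copied.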
*/ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * if @flags contains NHR_REF, refcouting is performed on rt_ifp. * All references can be released later by calling rib_free_info() * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? 
*/ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... 
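 * The walk uses rt_ifdelroute() above as the filter, so every RTF_UP
 * rtentry whose rt_ifp matches the detaching interface is unlinked and
 * freed.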
*/ void +rt_flushifroutes_af(struct ifnet *ifp, int af) +{ + KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", + __func__, af, AF_MAX)); + + rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); +} + +void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, info->rti_filterdata) == 0) { /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? 
gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. 
* rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif #ifdef FLOWTABLE static struct rtentry * rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info) { #if defined(INET6) || defined(INET) struct radix_node *rn; #endif struct rtentry *rt0; rt0 = NULL; /* "flow-table" only supports IPv6 and IPv4 at the moment. */ switch (dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET6) || defined(INET) rn = rnh->rnh_matchaddr(dst, &rnh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; u_char *m, *n; int len; /* * compare mask to see if the new route is * more specific than the existing one */ rt0 = RNTORT(rn); RT_LOCK(rt0); RT_ADDREF(rt0); RT_UNLOCK(rt0); /* * A host route is already present, so * leave the flow-table entries as is. */ if (rt0->rt_flags & RTF_HOST) { RTFREE(rt0); rt0 = NULL; } else if (!(flags & RTF_HOST) && netmask) { mask = rt_mask(rt0); len = mask->sa_len; m = (u_char *)mask; n = (u_char *)netmask; while (len-- > 0) { if (*n != *m) break; n++; m++; } if (len == 0 || (*n < *m)) { RTFREE(rt0); rt0 = NULL; } } } #endif/* INET6 || INET */ } return (rt0); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; #ifdef FLOWTABLE struct rtentry *rt0; #endif struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. 
*/ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(ifa); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(ifa); uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif #ifdef FLOWTABLE rt0 = rt_flowtable_check_route(rnh, info); #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif return (EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. 
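/*
 * Illustrative sketch (not part of this change): a kernel caller removes
 * a host route from one FIB by filling only RTAX_DST in rt_addrinfo.
 * Passing NULL for ret_nrt lets rtrequest1_fib() free the unlinked entry;
 * if a pointer were passed instead, the caller would own a referenced,
 * unlocked rtentry and have to RTFREE() it.  "dst" is assumed to be a
 * fully initialized destination sockaddr.
 */
static int
example_delete_host_route(struct sockaddr *dst, u_int fibnum)
{
	struct rt_addrinfo info;

	bzero(&info, sizeof(info));
	info.rti_flags = RTF_HOST;
	info.rti_info[RTAX_DST] = dst;
	return (rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum));
}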
*/ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { error = rt_getifa_fib(info, fibnum); if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? 
info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. 
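/*
 * Illustrative, self-contained userland rendering of rt_maskedcopy()
 * (not part of this change): sa_len and sa_family are copied verbatim,
 * the remaining bytes are ANDed with the netmask up to the shorter of
 * the two lengths, and any tail is zeroed.  With 192.0.2.123 masked by
 * 255.255.255.0 the program prints 192.0.2.0.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
masked_copy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *mask)
{
	u_char *cp1 = (u_char *)src, *cp2 = (u_char *)dst;
	u_char *cp3 = (u_char *)mask;
	u_char *cplim = cp2 + mask->sa_len, *cplim2 = cp2 + src->sa_len;

	*cp2++ = *cp1++; *cp2++ = *cp1++;	/* sa_len and sa_family */
	cp3 += 2;
	if (cplim > cplim2)
		cplim = cplim2;
	while (cp2 < cplim)
		*cp2++ = *cp1++ & *cp3++;
	if (cp2 < cplim2)
		memset(cp2, 0, (size_t)(cplim2 - cp2));
}

int
main(void)
{
	struct sockaddr_in dst = { .sin_len = sizeof(dst), .sin_family = AF_INET };
	struct sockaddr_in mask = { .sin_len = sizeof(mask), .sin_family = AF_INET };
	struct sockaddr_in out = { 0 };
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.123", &dst.sin_addr);
	inet_pton(AF_INET, "255.255.255.0", &mask.sin_addr);
	masked_copy((struct sockaddr *)&dst, (struct sockaddr *)&out,
	    (struct sockaddr *)&mask);
	printf("%s\n", inet_ntop(AF_INET, &out.sin_addr, buf, sizeof(buf)));
	return (0);
}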
*/ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. 
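/*
 * Illustrative sketch (not part of this change): rtinit() is what
 * address-configuration code calls to install or withdraw the route that
 * belongs to an ifaddr; rtinit1() above then replicates the request into
 * the relevant FIBs and marks the route RTF_PINNED.  The flags chosen
 * here are only an example, the real ones depend on the address family.
 */
static int
example_install_ifa_route(struct ifaddr *ifa, int hostroute)
{

	return (rtinit(ifa, RTM_ADD, RTF_UP | (hostroute ? RTF_HOST : 0)));
}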
*/ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: projects/vnet/sys/net/route.h =================================================================== --- projects/vnet/sys/net/route.h (revision 297511) +++ projects/vnet/sys/net/route.h (revision 297512) @@ -1,465 +1,466 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct rtentry *ro_rt; char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */ #define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_filler[3]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL extern u_int rt_numfibs; /* number of usable routing tables */ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) #endif /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. 
*/ #ifndef RNF_NORMAL #include #ifdef RADIX_MPATH #include #endif #endif #if defined(_KERNEL) || defined(_WANT_RTENTRY) struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ /* * XXX struct rtentry must begin with a struct radix_node (or two!) * because the code does some casts of a 'struct radix_node *' * to a 'struct rtentry *' */ #define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) #define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ u_long rt_mtu; /* MTU for this path */ u_long rt_weight; /* absolute weight */ u_long rt_expire; /* lifetime for route, e.g. redirect */ #define rt_endzero rt_pksent counter_u64_t rt_pksent; /* packets sent using this route */ struct mtx rt_mtx; /* mutex for routing entry */ struct rtentry *rt_chain; /* pointer to next rtentry to delete */ }; #endif /* _KERNEL || _WANT_RTENTRY */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ #define RTF_RNH_LOCKED 0x40000000 /* unused */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. 
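/*
 * Illustrative sketch (not part of this change): an output path that
 * caches a struct route can convert the rtentry flags into the cheaper
 * ro_flags bits once per lookup with rt_update_ro_flags() and test those
 * afterwards.  The destination is assumed to be stored in ro->ro_dst by
 * the caller; rtalloc_ign_fib() and the error values are only examples.
 */
static int
example_check_cached_route(struct route *ro, u_int fibnum)
{

	if (ro->ro_rt == NULL)
		rtalloc_ign_fib(ro, 0, fibnum);
	if (ro->ro_rt == NULL)
		return (EHOSTUNREACH);
	rt_update_ro_flags(ro);
	if (ro->ro_flags & (RT_REJECT | RT_BLACKHOLE))
		return (EHOSTUNREACH);
	/* ... transmit, honouring RT_HAS_GW for the next hop ... */
	return (0);
}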
*/ /* Consumer-visible nexthop info flags */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ /* Nexthop request flags */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #ifdef _KERNEL /* rte<>ro_flags translation */ static inline void rt_update_ro_flags(struct route *ro) { int rt_flags = ro->ro_rt->rt_flags; ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0; } #endif /* * Routing statistics. */ struct rtstat { short rts_badredirect; /* bogus redirect calls */ short rts_dynamic; /* routes created by redirects */ short rts_newgateway; /* routes modified by redirects */ short rts_unreach; /* lookups which failed */ short rts_wildcard; /* lookups satisfied by a wildcard */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. */ #define RTM_ADD 0x1 /* Add Route */ #define RTM_DELETE 0x2 /* Delete Route */ #define RTM_CHANGE 0x3 /* Change Metrics or flags */ #define RTM_GET 0x4 /* Report Metrics */ #define RTM_LOSING 0x5 /* Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* Told to use different route */ #define RTM_MISS 0x7 /* Lookup failed on this address */ #define RTM_LOCK 0x8 /* fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* address being added to iface */ #define RTM_DELADDR 0xd /* address being removed from iface */ #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* iface arrival/departure */ #define RTM_IEEE80211 0x12 /* IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. 
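/*
 * Illustrative, self-contained userland sketch (not part of this change):
 * the rt_msghdr layout and RTM_* codes above are what route(8) and other
 * tools speak over a PF_ROUTE socket.  The fragment below asks which
 * route 192.0.2.1 would use; reading back the RTM_GET reply and error
 * handling are left out.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/route.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int
example_rtm_get(void)
{
	struct {
		struct rt_msghdr	rtm;
		struct sockaddr_in	dst;
	} m;
	int s;

	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) == -1)
		return (-1);
	memset(&m, 0, sizeof(m));
	m.rtm.rtm_version = RTM_VERSION;
	m.rtm.rtm_type = RTM_GET;
	m.rtm.rtm_addrs = RTA_DST;
	m.rtm.rtm_seq = 1;
	m.rtm.rtm_msglen = sizeof(m);
	m.dst.sin_len = sizeof(m.dst);
	m.dst.sin_family = AF_INET;
	m.dst.sin_addr.s_addr = htonl(0xc0000201);	/* 192.0.2.1 */
	if (write(s, &m, sizeof(m)) != (ssize_t)sizeof(m)) {
		close(s);
		return (-1);
	}
	/* A read(2) on s now returns the kernel's reply (same rtm_seq). */
	close(s);
	return (0);
}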
*/ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ typedef int rt_filter_f_t(const struct rtentry *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rt_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter paramenters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The check for a NULL pointer is just a convenience, probably never used. * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (!(sa) || ((struct sockaddr *)(sa))->sa_len == 0) ? 
\ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RT_LOCK_INIT(_rt) \ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) #define RT_UNLOCK_COND(_rt) do { \ if (mtx_owned(&(_rt)->rt_mtx)) \ mtx_unlock(&(_rt)->rt_mtx); \ } while (0) #define RT_ADDREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt >= 0, \ ("negative refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt++; \ } while (0) #define RT_REMREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt > 0, \ ("bogus refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt--; \ } while (0) #define RTFREE_LOCKED(_rt) do { \ if ((_rt)->rt_refcnt <= 1) \ rtfree(_rt); \ else { \ RT_REMREF(_rt); \ RT_UNLOCK(_rt); \ } \ /* guard against invalid refs */ \ _rt = 0; \ } while (0) #define RTFREE(_rt) do { \ RT_LOCK(_rt); \ RTFREE_LOCKED(_rt); \ } while (0) #define RO_RTFREE(_ro) do { \ if ((_ro)->ro_rt) { \ if ((_ro)->ro_flags & RT_NORTREF) { \ (_ro)->ro_flags &= ~RT_NORTREF; \ (_ro)->ro_rt = NULL; \ } else { \ RT_LOCK((_ro)->ro_rt); \ RTFREE_LOCKED((_ro)->ro_rt); \ } \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); void rt_newaddrmsg_fib(int, struct ifaddr *, int, struct rtentry *, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int); void rt_table_destroy(struct rib_head *); int rtsock_addrmsg(int, struct ifaddr *, int); int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); /* * Note the following locking behavior: * * rtalloc1() returns a locked rtentry * * rtfree() and RTFREE_LOCKED() require a locked rtentry * * RTFREE() uses an unlocked entry. */ void rtfree(struct rtentry *); void rt_updatemtu(struct ifnet *); typedef int rt_walktree_f_t(struct rtentry *, void *); typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *); void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); +void rt_flushifroutes_af(struct ifnet *, int); void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */ /* Thes are used by old code not yet converted to use multiple FIBS */ struct rtentry *rtalloc1(struct sockaddr *, int, u_long); int rtinit(struct ifaddr *, int, int); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. 
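/*
 * Illustrative sketch (not part of this change) of the reference and
 * locking discipline spelled out above: rtalloc1_fib() hands back a
 * locked, referenced rtentry; RTFREE_LOCKED() drops both.  RTFREE()
 * would be used instead if the entry had already been unlocked.
 */
static struct ifnet *
example_lookup_ifp(struct sockaddr *dst, u_int fibnum)
{
	struct rtentry *rt;
	struct ifnet *ifp;

	rt = rtalloc1_fib(dst, 1, 0UL, fibnum);
	if (rt == NULL)
		return (NULL);
	ifp = rt->rt_ifp;	/* read what is needed while still locked */
	RTFREE_LOCKED(rt);
	return (ifp);
}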
*/ int rt_getifa_fib(struct rt_addrinfo *, u_int fibnum); void rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum); struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int); int rtioctl_fib(u_long, caddr_t, u_int); void rtredirect_fib(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int rtrequest_fib(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); int rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int); int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); #endif #endif Index: projects/vnet/sys/net/vnet.c =================================================================== --- projects/vnet/sys/net/vnet.c (revision 297511) +++ projects/vnet/sys/net/vnet.c (revision 297512) @@ -1,773 +1,797 @@ /*- * Copyright (c) 2004-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include /*- * This file implements core functions for virtual network stacks: * * - Virtual network stack management functions. * * - Virtual network stack memory allocator, which virtualizes global * variables in the network stack * * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems * to register startup/shutdown events to be run for each virtual network * stack instance. */ FEATURE(vimage, "VIMAGE kernel virtualization"); static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* * The virtual network stack list has two read-write locks, one sleepable and * the other not, so that the list can be stablized and walked in a variety * of network stack contexts. 
Both must be acquired exclusively to modify * the list, but a read lock of either lock is sufficient to walk the list. */ struct rwlock vnet_rwlock; struct sx vnet_sxlock; #define VNET_LIST_WLOCK() do { \ sx_xlock(&vnet_sxlock); \ rw_wlock(&vnet_rwlock); \ } while (0) #define VNET_LIST_WUNLOCK() do { \ rw_wunlock(&vnet_rwlock); \ sx_xunlock(&vnet_sxlock); \ } while (0) struct vnet_list_head vnet_head; struct vnet *vnet0; /* * The virtual network stack allocator provides storage for virtualized * global variables. These variables are defined/declared using the * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet' * linker set. The details of the implementation are somewhat subtle, but * allow the majority of most network subsystems to maintain * virtualization-agnostic. * * The virtual network stack allocator handles variables in the base kernel * vs. modules in similar but different ways. In both cases, virtualized * global variables are marked as such by being declared to be part of the * vnet linker set. These "master" copies of global variables serve two * functions: * * (1) They contain static initialization or "default" values for global * variables which will be propagated to each virtual network stack * instance when created. As with normal global variables, they default * to zero-filled. * * (2) They act as unique global names by which the variable can be referred * to, regardless of network stack instance. The single global symbol * will be used to calculate the location of a per-virtual instance * variable at run-time. * * Each virtual network stack instance has a complete copy of each * virtualized global variable, stored in a malloc'd block of memory * referred to by vnet->vnet_data_mem. Critical to the design is that each * per-instance memory block is laid out identically to the master block so * that the offset of each global variable is the same across all blocks. To * optimize run-time access, a precalculated 'base' address, * vnet->vnet_data_base, is stored in each vnet, and is the amount that can * be added to the address of a 'master' instance of a variable to get to the * per-vnet instance. * * Virtualized global variables are handled in a similar manner, but as each * module has its own 'set_vnet' linker set, and we want to keep all * virtualized globals togther, we reserve space in the kernel's linker set * for potential module variables using a per-vnet character array, * 'modspace'. The virtual network stack allocator maintains a free list to * track what space in the array is free (all, initially) and as modules are * linked, allocates portions of the space to specific globals. The kernel * module linker queries the virtual network stack allocator and will * bind references of the global to the location during linking. It also * calls into the virtual network stack allocator, once the memory is * initialized, in order to propagate the new static initializations to all * existing virtual network stack instances so that the soon-to-be executing * module will find every network stack instance with proper default values. */ /* * Number of bytes of data in the 'set_vnet' linker set, and hence the total * size of all kernel virtualized global variables, and the malloc(9) type * that will be used to allocate it. */ #define VNET_BYTES (VNET_STOP - VNET_START) static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data"); /* * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of * global variables across all loaded modules. 
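/*
 * Illustrative sketch (not part of this change): a subsystem virtualizes
 * a global by defining it via VNET_DEFINE() and reading/writing it
 * through a V_-prefixed alias, just like rt_add_addr_allfibs in
 * route.h/route.c above.  "foo_enable" is a made-up name.
 */
static VNET_DEFINE(int, foo_enable) = 1;
#define	V_foo_enable	VNET(foo_enable)

static void
example_foo_set(int onoff)
{

	/* Each vnet sees and modifies only its own copy. */
	V_foo_enable = onoff;
}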
As this actually sizes an * array declared as a virtualized global variable in the kernel itself, and * we want the virtualized global variable space to be page-sized, we may * have more space than that in practice. */ #define VNET_MODMIN 8192 #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) /* * Space to store virtualized global variables from loadable kernel modules, * and the free list to manage it. */ static VNET_DEFINE(char, modspace[VNET_MODMIN]); /* * Global lists of subsystem constructor and destructors for vnets. They are * registered via VNET_SYSINIT() and VNET_SYSUNINIT(). Both lists are * protected by the vnet_sysinit_sxlock global lock. */ static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors = TAILQ_HEAD_INITIALIZER(vnet_constructors); static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors = TAILQ_HEAD_INITIALIZER(vnet_destructors); struct sx vnet_sysinit_sxlock; #define VNET_SYSINIT_WLOCK() sx_xlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_WUNLOCK() sx_xunlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RLOCK() sx_slock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RUNLOCK() sx_sunlock(&vnet_sysinit_sxlock); struct vnet_data_free { uintptr_t vnd_start; int vnd_len; TAILQ_ENTRY(vnet_data_free) vnd_link; }; static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting"); static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head = TAILQ_HEAD_INITIALIZER(vnet_data_free_head); static struct sx vnet_data_free_lock; SDT_PROVIDER_DEFINE(vnet); SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, "int", "struct vnet *"); SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, "int"); #ifdef DDB static void db_show_vnet_print_vs(struct vnet_sysinit *, int); #endif /* * Allocate a virtual network stack. */ struct vnet * vnet_alloc(void) { struct vnet *vnet; SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__); vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; + vnet->vnet_state = VNET_STATE_STARTING; SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet); /* * Allocate storage for virtualized global variables and copy in * initial values form our 'master' copy. */ vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK); memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES); /* * All use of vnet-specific data will immediately subtract VNET_START * from the base memory pointer, so pre-calculate that now to avoid * it on each use. */ vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START; /* Initialize / attach vnet module instances. */ CURVNET_SET_QUIET(vnet); vnet_sysinit(); CURVNET_RESTORE(); VNET_LIST_WLOCK(); + vnet->vnet_state = VNET_STATE_ACTIVE; LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); VNET_LIST_WUNLOCK(); SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet); return (vnet); } /* * Destroy a virtual network stack. 
*/ void vnet_destroy(struct vnet *vnet) { SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet); KASSERT(vnet->vnet_sockcnt == 0, ("%s: vnet still has sockets", __func__)); VNET_LIST_WLOCK(); + vnet->vnet_state = VNET_STATE_DYING; LIST_REMOVE(vnet, vnet_le); VNET_LIST_WUNLOCK(); CURVNET_SET_QUIET(vnet); vnet_sysuninit(); CURVNET_RESTORE(); /* * Release storage for the virtual network stack instance. */ free(vnet->vnet_data_mem, M_VNET_DATA); vnet->vnet_data_mem = NULL; vnet->vnet_data_base = 0; vnet->vnet_magic_n = 0xdeadbeef; free(vnet, M_VNET); SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__); } /* + * Cloned interfaces have become such a layer violation that they need + * special treatmeant. They need to go very first and they need to be able + * to clean themselves up entirely and not wait for the stack to shutdown as + * we if_free() them. We would like to split the cleanup of them up as well + * but in a non-VNET context a ifconfig foo0 destroy still has t work as well. + * This MUST be the only SYSUNINIT on SI_SUB_PSEUDO_DONE/SI_ORDER_FIRST! + */ +static void +vnet_uninit_after_pseudo(const void *unused __unused) +{ + + curvnet->vnet_state = VNET_STATE_DYING_AFTER_PSEUDO; +} +VNET_SYSUNINIT(vnet_if_uninit_after_pseudo, SI_SUB_PSEUDO_DONE, SI_ORDER_FIRST, + vnet_uninit_after_pseudo, NULL); + +/* * Boot time initialization and allocation of virtual network stacks. */ static void vnet_init_prelink(void *arg) { rw_init(&vnet_rwlock, "vnet_rwlock"); sx_init(&vnet_sxlock, "vnet_sxlock"); sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void vnet0_init(void *arg) { /* Warn people before take off - in case we crash early. */ printf("WARNING: VIMAGE (virtualized network stack) is a highly " "experimental feature.\n"); /* * We MUST clear curvnet in vi_init_done() before going SMP, * otherwise CURVNET_SET() macros would scream about unnecessary * curvnet recursions. */ curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); } SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); static void vnet_init_done(void *unused) { curvnet = NULL; } SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done, NULL); /* * Once on boot, initialize the modspace freelist to entirely cover modspace. */ static void vnet_data_startup(void *dummy __unused) { struct vnet_data_free *df; df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); df->vnd_start = (uintptr_t)&VNET_NAME(modspace); df->vnd_len = VNET_MODMIN; TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link); sx_init(&vnet_data_free_lock, "vnet_data alloc lock"); } SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, 0); /* * When a module is loaded and requires storage for a virtualized global * variable, allocate space from the modspace free list. This interface * should be used only by the kernel linker. 
*/ void * vnet_data_alloc(int size) { struct vnet_data_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_len < size) continue; if (df->vnd_len == size) { s = (void *)df->vnd_start; TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link); free(df, M_VNET_DATA_FREE); break; } s = (void *)df->vnd_start; df->vnd_len -= size; df->vnd_start = df->vnd_start + size; break; } sx_xunlock(&vnet_data_free_lock); return (s); } /* * Free space for a virtualized global variable on module unload. */ void vnet_data_free(void *start_arg, int size) { struct vnet_data_free *df; struct vnet_data_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)start_arg; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_start > end) break; /* * If we expand at the end of an entry we may have to merge * it with the one following it as well. */ if (df->vnd_start + df->vnd_len == start) { df->vnd_len += size; dn = TAILQ_NEXT(df, vnd_link); if (df->vnd_start + df->vnd_len == dn->vnd_start) { df->vnd_len += dn->vnd_len; TAILQ_REMOVE(&vnet_data_free_head, dn, vnd_link); free(dn, M_VNET_DATA_FREE); } sx_xunlock(&vnet_data_free_lock); return; } if (df->vnd_start == end) { df->vnd_start = start; df->vnd_len += size; sx_xunlock(&vnet_data_free_lock); return; } } dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); dn->vnd_start = start; dn->vnd_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, vnd_link); else TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link); sx_xunlock(&vnet_data_free_lock); } /* * When a new virtualized global variable has been allocated, propagate its * initial value to each already-allocated virtual network stack instance. */ void vnet_data_copy(void *start, int size) { struct vnet *vnet; VNET_LIST_RLOCK(); LIST_FOREACH(vnet, &vnet_head, vnet_le) memcpy((void *)((uintptr_t)vnet->vnet_data_base + (uintptr_t)start), start, size); VNET_LIST_RUNLOCK(); } /* * Support for special SYSINIT handlers registered via VNET_SYSINIT() * and VNET_SYSUNINIT(). */ void vnet_register_sysinit(void *arg) { struct vnet_sysinit *vs, *vs2; struct vnet *vnet; vs = arg; KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early")); /* Add the constructor to the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_constructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_constructors, vs, link); /* * Invoke the constructor on all the existing vnets when it is * registered. */ VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysinit(void *arg) { struct vnet_sysinit *vs; vs = arg; /* Remove the constructor from the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_REMOVE(&vnet_constructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_register_sysuninit(void *arg) { struct vnet_sysinit *vs, *vs2; vs = arg; /* Add the destructor to the global list of vnet destructors. 
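/*
 * Illustrative sketch (not part of this change): a subsystem registers a
 * per-vnet constructor/destructor pair.  The registration code above runs
 * the constructor immediately for every existing vnet and arranges for
 * later vnets to run it at creation; the destructor runs at vnet teardown
 * and when the registration is removed again.  The names and the chosen
 * SI_SUB_* slot are examples only; compare VNET_PCPUSTAT_SYSINIT() in
 * vnet.h below.
 */
static void
vnet_foo_init(const void *unused __unused)
{

	/* Set up this vnet's private foo state. */
}
VNET_SYSINIT(vnet_foo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
    vnet_foo_init, NULL);

static void
vnet_foo_uninit(const void *unused __unused)
{

	/* Tear down this vnet's private foo state. */
}
VNET_SYSUNINIT(vnet_foo_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
    vnet_foo_uninit, NULL);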
*/ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_destructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysuninit(void *arg) { struct vnet_sysinit *vs; struct vnet *vnet; vs = arg; /* * Invoke the destructor on all the existing vnets when it is * deregistered. */ VNET_SYSINIT_WLOCK(); VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } /* Remove the destructor from the global list of vnet destructors. */ TAILQ_REMOVE(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } /* * Invoke all registered vnet constructors on the current vnet. Used during * vnet construction. The caller is responsible for ensuring the new vnet is * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysinit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH(vs, &vnet_constructors, link) { vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * Invoke all registered vnet destructors on the current vnet. Used during * vnet destruction. The caller is responsible for ensuring the dying vnet * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysuninit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * EVENTHANDLER(9) extensions. */ /* * Invoke the eventhandler function originally registered with the possibly * registered argument for all virtual network stack instances. * * This iterator can only be used for eventhandlers that do not take any * additional arguments, as we do ignore the variadic arguments from the * EVENTHANDLER_INVOKE() call. */ void vnet_global_eventhandler_iterator_func(void *arg, ...) { VNET_ITERATOR_DECL(vnet_iter); struct eventhandler_entry_vimage *v_ee; /* * There is a bug here in that we should actually cast things to * (struct eventhandler_entry_ ## name *) but that's not easily * possible in here so just re-using the variadic version we * defined for the generic vimage case. */ v_ee = arg; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } #ifdef VNET_DEBUG struct vnet_recursion { SLIST_ENTRY(vnet_recursion) vnr_le; const char *prev_fn; const char *where_fn; int where_line; struct vnet *old_vnet; struct vnet *new_vnet; }; static SLIST_HEAD(, vnet_recursion) vnet_recursions = SLIST_HEAD_INITIALIZER(vnet_recursions); static void vnet_print_recursion(struct vnet_recursion *vnr, int brief) { if (!brief) printf("CURVNET_SET() recursion in "); printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, vnr->prev_fn); if (brief) printf(", "); else printf("\n "); printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet); } void vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line) { struct vnet_recursion *vnr; /* Skip already logged recursion events. 
*/ SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) if (vnr->prev_fn == old_fn && vnr->where_fn == curthread->td_vnet_lpush && vnr->where_line == line && (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet)) return; vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO); if (vnr == NULL) panic("%s: malloc failed", __func__); vnr->prev_fn = old_fn; vnr->where_fn = curthread->td_vnet_lpush; vnr->where_line = line; vnr->old_vnet = old_vnet; vnr->new_vnet = curvnet; SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le); vnet_print_recursion(vnr, 0); #ifdef KDB kdb_backtrace(); #endif } #endif /* VNET_DEBUG */ /* * DDB(4). */ #ifdef DDB DB_SHOW_COMMAND(vnets, db_show_vnets) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { + if (have_addr && addr != 0 && + (struct vnet *)addr != vnet_iter) + continue; db_printf("vnet = %p\n", vnet_iter); db_printf(" vnet_magic_n = 0x%x (%s, orig 0x%x)\n", vnet_iter->vnet_magic_n, (vnet_iter->vnet_magic_n == VNET_MAGIC_N) ? "ok" : "mismatch", VNET_MAGIC_N); db_printf(" vnet_ifcnt = %u\n", vnet_iter->vnet_ifcnt); db_printf(" vnet_sockcnt = %u\n", vnet_iter->vnet_sockcnt); db_printf(" vnet_data_mem = %p\n", vnet_iter->vnet_data_mem); db_printf(" vnet_data_base = 0x%jx\n", (uintmax_t)vnet_iter->vnet_data_base); + db_printf(" vnet_state = %#x\n", vnet_iter->vnet_state); db_printf("\n"); if (db_pager_quit) break; } } static void db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb) { const char *vsname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (vs == NULL) { xprint("%s: no vnet_sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); db_symbol_values(sym, &vsname, NULL); sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); xprint(" 0x%08x 0x%08x\n", vs->subsystem, vs->order); xprint(" %p(%s)(%p)\n", vs->func, (funcname != NULL) ? funcname : "", vs->arg); #undef xprint } DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit) { struct vnet_sysinit *vs; db_printf("VNET_SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH(vs, &vnet_constructors, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit) { struct vnet_sysinit *vs; db_printf("VNET_SYSUNINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } #ifdef VNET_DEBUG DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs) { struct vnet_recursion *vnr; SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) vnet_print_recursion(vnr, 1); } #endif #endif /* DDB */ Index: projects/vnet/sys/net/vnet.h =================================================================== --- projects/vnet/sys/net/vnet.h (revision 297511) +++ projects/vnet/sys/net/vnet.h (revision 297512) @@ -1,431 +1,436 @@ /*- * Copyright (c) 2006-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /*- * This header file defines several sets of interfaces supporting virtualized * network stacks: * * - Definition of 'struct vnet' and functions and macros to allocate/free/ * manipulate it. * * - A virtual network stack memory allocator, which provides support for * virtualized global variables via a special linker set, set_vnet. * * - Virtualized sysinits/sysuninits, which allow constructors and * destructors to be run for each network stack subsystem as virtual * instances are created and destroyed. * * If VIMAGE isn't compiled into the kernel, virtualized global variables * compile to normal global variables, and virtualized sysinits to regular * sysinits. */ #ifndef _NET_VNET_H_ #define _NET_VNET_H_ /* * struct vnet describes a virtualized network stack, and is primarily a * pointer to storage for virtualized global variables. Expose to userspace * as required for libkvm. */ #if defined(_KERNEL) || defined(_WANT_VNET) #include struct vnet { LIST_ENTRY(vnet) vnet_le; /* all vnets list */ u_int vnet_magic_n; u_int vnet_ifcnt; u_int vnet_sockcnt; + u_int vnet_state; +#define VNET_STATE_STARTING 0x01 +#define VNET_STATE_ACTIVE 0x02 +#define VNET_STATE_DYING 0x04 +#define VNET_STATE_DYING_AFTER_PSEUDO 0x08 void *vnet_data_mem; uintptr_t vnet_data_base; }; #define VNET_MAGIC_N 0x3e0d8f29 /* * These two virtual network stack allocator definitions are also required * for libkvm so that it can evaluate virtualized global variables. 
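 *
 * As a rough sketch of how the two definitions below are used: a
 * declaration such as (the variable name "foo" is only a placeholder)
 *
 *	VNET_DEFINE(int, foo);
 *
 * emits an instance named "vnet_entry_foo" (VNET_SYMPREFIX plus the
 * variable name) into the "set_vnet" linker set (VNET_SETNAME).
 * libkvm can therefore look the master copy up by symbol name and
 * evaluate the per-vnet copies relative to each vnet's data base.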
*/ #define VNET_SETNAME "set_vnet" #define VNET_SYMPREFIX "vnet_entry_" #endif #ifdef _KERNEL #define VNET_PCPUSTAT_DECLARE(type, name) \ VNET_DECLARE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_DEFINE(type, name) \ VNET_DEFINE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_ALLOC(name, wait) \ COUNTER_ARRAY_ALLOC(VNET(name), \ sizeof(VNET(name)) / sizeof(counter_u64_t), (wait)) #define VNET_PCPUSTAT_FREE(name) \ COUNTER_ARRAY_FREE(VNET(name), sizeof(VNET(name)) / sizeof(counter_u64_t)) #define VNET_PCPUSTAT_ADD(type, name, f, v) \ counter_u64_add(VNET(name)[offsetof(type, f) / sizeof(uint64_t)], (v)) #define VNET_PCPUSTAT_FETCH(type, name, f) \ counter_u64_fetch(VNET(name)[offsetof(type, f) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_SYSINIT(name) \ static void \ vnet_##name##_init(const void *unused) \ { \ VNET_PCPUSTAT_ALLOC(name, M_WAITOK); \ } \ VNET_SYSINIT(vnet_ ## name ## _init, SI_SUB_PROTO_IFATTACHDOMAIN, \ SI_ORDER_ANY, vnet_ ## name ## _init, NULL) #define VNET_PCPUSTAT_SYSUNINIT(name) \ static void \ vnet_##name##_uninit(const void *unused) \ { \ VNET_PCPUSTAT_FREE(name); \ } \ VNET_SYSUNINIT(vnet_ ## name ## _uninit, SI_SUB_PROTO_IFATTACHDOMAIN, \ SI_ORDER_ANY, vnet_ ## name ## _uninit, NULL) #ifdef SYSCTL_OID #define SYSCTL_VNET_PCPUSTAT(parent, nbr, name, type, array, desc) \ static int \ array##_sysctl(SYSCTL_HANDLER_ARGS) \ { \ type s; \ CTASSERT((sizeof(type) / sizeof(uint64_t)) == \ (sizeof(VNET(array)) / sizeof(counter_u64_t))); \ COUNTER_ARRAY_COPY(VNET(array), &s, sizeof(type) / sizeof(uint64_t));\ if (req->newptr) \ COUNTER_ARRAY_ZERO(VNET(array), \ sizeof(type) / sizeof(uint64_t)); \ return (SYSCTL_OUT(req, &s, sizeof(type))); \ } \ SYSCTL_PROC(parent, nbr, name, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RW, \ NULL, 0, array ## _sysctl, "I", desc) #endif /* SYSCTL_OID */ #ifdef VIMAGE #include #include /* for struct thread */ #include #include /* * Location of the kernel's 'set_vnet' linker set. */ extern uintptr_t *__start_set_vnet; __GLOBL(__start_set_vnet); extern uintptr_t *__stop_set_vnet; __GLOBL(__stop_set_vnet); #define VNET_START (uintptr_t)&__start_set_vnet #define VNET_STOP (uintptr_t)&__stop_set_vnet /* * Functions to allocate and destroy virtual network stacks. */ struct vnet *vnet_alloc(void); void vnet_destroy(struct vnet *vnet); /* * The current virtual network stack -- we may wish to move this to struct * pcpu in the future. */ #define curvnet curthread->td_vnet /* * Various macros -- get and set the current network stack, but also * assertions. 
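 *
 * A minimal usage sketch (the interface pointer and the surrounding
 * function are hypothetical):
 *
 *	CURVNET_SET(ifp->if_vnet);
 *	... operate on per-vnet state, e.g. via the V_() accessors ...
 *	CURVNET_RESTORE();
 *
 * CURVNET_SET() stashes the previous curvnet in a block-local
 * variable, so the two macros are expected to be paired within the
 * same block.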
*/ #if defined(INVARIANTS) || defined(VNET_DEBUG) #define VNET_ASSERT(exp, msg) do { \ if (!(exp)) \ panic msg; \ } while (0) #else #define VNET_ASSERT(exp, msg) do { \ } while (0) #endif #ifdef VNET_DEBUG void vnet_log_recursion(struct vnet *, const char *, int); #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ curvnet = arg; \ curthread->td_vnet_lpush = __func__; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) \ if (saved_vnet) \ vnet_log_recursion(saved_vnet, saved_vnet_lpush, __LINE__); #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; \ curthread->td_vnet_lpush = saved_vnet_lpush; #else /* !VNET_DEBUG */ #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ curvnet = arg; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; #endif /* VNET_DEBUG */ extern struct vnet *vnet0; #define IS_DEFAULT_VNET(arg) ((arg) == vnet0) #define CRED_TO_VNET(cr) (cr)->cr_prison->pr_vnet #define TD_TO_VNET(td) CRED_TO_VNET((td)->td_ucred) #define P_TO_VNET(p) CRED_TO_VNET((p)->p_ucred) /* * Global linked list of all virtual network stacks, along with read locks to * access it. If a caller may sleep while accessing the list, it must use * the sleepable lock macros. */ LIST_HEAD(vnet_list_head, vnet); extern struct vnet_list_head vnet_head; extern struct rwlock vnet_rwlock; extern struct sx vnet_sxlock; #define VNET_LIST_RLOCK() sx_slock(&vnet_sxlock) #define VNET_LIST_RLOCK_NOSLEEP() rw_rlock(&vnet_rwlock) #define VNET_LIST_RUNLOCK() sx_sunlock(&vnet_sxlock) #define VNET_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vnet_rwlock) /* * Iteration macros to walk the global list of virtual network stacks. */ #define VNET_ITERATOR_DECL(arg) struct vnet *arg #define VNET_FOREACH(arg) LIST_FOREACH((arg), &vnet_head, vnet_le) /* * Virtual network stack memory allocator, which allows global variables to * be automatically instantiated for each network stack instance. */ #define VNET_NAME(n) vnet_entry_##n #define VNET_DECLARE(t, n) extern t VNET_NAME(n) #define VNET_DEFINE(t, n) t VNET_NAME(n) __section(VNET_SETNAME) __used #define _VNET_PTR(b, n) (__typeof(VNET_NAME(n))*) \ ((b) + (uintptr_t)&VNET_NAME(n)) #define _VNET(b, n) (*_VNET_PTR(b, n)) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) _VNET_PTR((vnet)->vnet_data_base, n) #define VNET_VNET(vnet, n) (*VNET_VNET_PTR((vnet), n)) #define VNET_PTR(n) VNET_VNET_PTR(curvnet, n) #define VNET(n) VNET_VNET(curvnet, n) /* * Virtual network stack allocator interfaces from the kernel linker. 
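 *
 * For reference, the accessor macros above reduce to pointer
 * arithmetic against the per-vnet data base; with a placeholder
 * variable "foo":
 *
 *	VNET(foo)
 *	    == *VNET_VNET_PTR(curvnet, foo)
 *	    == *(__typeof(vnet_entry_foo) *)
 *	       (curvnet->vnet_data_base + (uintptr_t)&vnet_entry_foo)
 *
 * The interfaces below are what the kernel linker uses to set up and
 * tear down the per-vnet storage that makes this arithmetic valid.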
*/ void *vnet_data_alloc(int size); void vnet_data_copy(void *start, int size); void vnet_data_free(void *start_arg, int size); /* * Virtual sysinit mechanism, allowing network stack components to declare * startup and shutdown methods to be run when virtual network stack * instances are created and destroyed. */ #include /* * SYSINIT/SYSUNINIT variants that provide per-vnet constructors and * destructors. */ struct vnet_sysinit { enum sysinit_sub_id subsystem; enum sysinit_elem_order order; sysinit_cfunc_t func; const void *arg; TAILQ_ENTRY(vnet_sysinit) link; }; #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ static struct vnet_sysinit ident ## _vnet_init = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_init_ ## ident, subsystem, order, \ vnet_register_sysinit, &ident ## _vnet_init); \ SYSUNINIT(vnet_init_ ## ident, subsystem, order, \ vnet_deregister_sysinit, &ident ## _vnet_init) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ static struct vnet_sysinit ident ## _vnet_uninit = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_register_sysuninit, &ident ## _vnet_uninit); \ SYSUNINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_deregister_sysuninit, &ident ## _vnet_uninit) /* * Run per-vnet sysinits or sysuninits during vnet creation/destruction. */ void vnet_sysinit(void); void vnet_sysuninit(void); /* * Interfaces for managing per-vnet constructors and destructors. */ void vnet_register_sysinit(void *arg); void vnet_register_sysuninit(void *arg); void vnet_deregister_sysinit(void *arg); void vnet_deregister_sysuninit(void *arg); /* * EVENTHANDLER(9) extensions. */ #include void vnet_global_eventhandler_iterator_func(void *, ...); #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ (tag) = vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #else /* !VIMAGE */ /* * Various virtual network stack macros compile to no-ops without VIMAGE. */ #define curvnet NULL #define VNET_ASSERT(exp, msg) #define CURVNET_SET(arg) #define CURVNET_SET_QUIET(arg) #define CURVNET_RESTORE() #define VNET_LIST_RLOCK() #define VNET_LIST_RLOCK_NOSLEEP() #define VNET_LIST_RUNLOCK() #define VNET_LIST_RUNLOCK_NOSLEEP() #define VNET_ITERATOR_DECL(arg) #define VNET_FOREACH(arg) #define IS_DEFAULT_VNET(arg) 1 #define CRED_TO_VNET(cr) NULL #define TD_TO_VNET(td) NULL #define P_TO_VNET(p) NULL /* * Versions of the VNET macros that compile to normal global variables and * standard sysctl definitions. */ #define VNET_NAME(n) n #define VNET_DECLARE(t, n) extern t n #define VNET_DEFINE(t, n) t n #define _VNET_PTR(b, n) &VNET_NAME(n) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) (&(n)) #define VNET_VNET(vnet, n) (n) #define VNET_PTR(n) (&(n)) #define VNET(n) (n) /* * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT * map into normal sysinits, which have the same ordering properties. 
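 *
 * In other words, without VIMAGE a declaration such as (identifiers
 * are illustrative only)
 *
 *	VNET_SYSINIT(foo_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
 *	    foo_init, NULL);
 *
 * is just the corresponding SYSINIT(), run once at boot instead of
 * once for every virtual network stack instance.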
*/ #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ SYSINIT(ident, subsystem, order, func, arg) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ SYSUNINIT(ident, subsystem, order, func, arg) /* * Without VIMAGE revert to the default implementation. */ #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ (tag) = eventhandler_register(NULL, #name, func, arg, priority) #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ eventhandler_register(NULL, #name, func, arg, priority) #endif /* VIMAGE */ #endif /* _KERNEL */ #endif /* !_NET_VNET_H_ */ Index: projects/vnet/sys/netinet/igmp.c =================================================================== --- projects/vnet/sys/netinet/igmp.c (revision 297511) +++ projects/vnet/sys/netinet/igmp.c (revision 297512) @@ -1,3674 +1,3667 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. 
* * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. 
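 *
 * When more than one of these locks is needed, the query input and
 * timer paths below follow the permitted order, along the lines of:
 *
 *	IN_MULTI_LOCK();
 *	IGMP_LOCK();
 *	IF_ADDR_RLOCK(ifp);
 *	...
 *	IF_ADDR_RUNLOCK(ifp);
 *	IGMP_UNLOCK();
 *	IN_MULTI_UNLOCK();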
* * IN_MULTI_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. 
*/ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. 
*/ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * inet_ntoa_haddr(in_addr_t haddr) { struct in_addr ia; ia.s_addr = htonl(haddr); return (inet_ntoa(ia)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
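 *
 * A hypothetical userland consumer appends the interface index as the
 * final MIB component, roughly as follows (here "ifindex" names the
 * interface index of interest):
 *
 *	struct igmp_ifinfo info;
 *	int mib[5];
 *	size_t miblen = nitems(mib), len = sizeof(info);
 *
 *	if (sysctlnametomib("net.inet.igmp.ifinfo", mib, &miblen) == 0) {
 *		mib[miblen] = ifindex;
 *		(void)sysctl(mib, miblen + 1, &info, &len, NULL, 0);
 *	}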
*/ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. 
*/ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } - -#ifdef INVARIANTS - panic("%s: igmp_ifsoftc not found for ifp %p\n", __func__, ifp); -#endif } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. 
*/ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. 
*/ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. 
*/ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a recieved IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. 
*/ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. 
*/ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. 
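 * It may also fail, in which case the chain has been freed and NULL is
 * returned, so the code below bails out without touching the old
 * pointers and re-derives igmpv3 from the (possibly replaced) mbuf
 * once the pullup has succeeded.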
*/ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. 
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. 
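 * For illustration, with the RFC 3376 defaults (Robustness Variable 2,
 * Query Interval 125s, Query Response Interval 10s) this works out to
 * 2 * 125 + 10 = 260 seconds, converted to slow-timeout ticks below.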
*/ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. 
*/ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. 
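 * (The message is just the fixed 8-byte struct igmp behind a 20-byte
 * IP header, so the whole datagram fits comfortably in one mbuf.)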
*/ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. 
*/ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. 
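 * For example, with the default robustness variable of 2 the report is
 * transmitted twice in total, the copies spaced by a random delay of up
 * to the Unsolicited Report Interval; loopback links need only one copy.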
*/ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
*/ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. 
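 * (Each group record is the 8-byte struct igmp_grouprec followed by one
 * 4-byte in_addr_t per reported source, e.g. 20 bytes for three sources.)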
* Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
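 * Assuming IGMP_LEADINGSPACE covers the 20-byte IP header, the 4-byte
 * Router Alert option and the 8-byte report header, a 1500-byte MTU
 * leaves room for roughly (1500 - 32 - 8) / 4 = 365 sources per packet.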
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
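 * For example, changing a filter from EXCLUDE {A, B} to EXCLUDE {B, C}
 * yields an ALLOW_NEW_SOURCES record for A and a BLOCK_OLD_SOURCES
 * record for C.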
*/ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
*/ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. 
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. 
Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. 
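 * (The prepended header is sizeof(struct ip) + sizeof(struct igmp_report),
 * i.e. 20 + 8 = 28 bytes; ir_numgrps is filled from the per-packet record
 * count carried in the mbuf packet header.)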
*/ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_uninit(const void *unused __unused) { + /* This can happen when we shutdown the entire network stack. 
*/ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - - VNET_ASSERT(LIST_EMPTY(&V_igi_head), - ("%s: igi list %p not empty; ifnets not detached?", __func__, - &V_igi_head)); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* SLIST_HEAD(,in_multi) igi_relinmhead */ /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: projects/vnet/sys/netinet/in.c =================================================================== --- projects/vnet/sys/netinet/in.c (revision 297511) +++ projects/vnet/sys/netinet/in.c (revision 297512) @@ -1,1432 +1,1467 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. 
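 * (e.g. experimental 240/4, multicast 224/4, link-local 169.254/16, and
 * class A destinations on net 0 or the loopback net 127 are all refused.)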
*/ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. 
* ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. 
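 * That is, a host route back through the loopback interface so that
 * locally originated traffic to the interface's own address never leaves
 * the host; skipped if another address already provides the same route.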
*/ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); IN_MULTI_LOCK(); if (ii->ii_allhosts) { (void)in_leavegroup_locked(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } IN_MULTI_UNLOCK(); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. 
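/*
 * Illustrative aside, not part of this patch: the deletion path above is
 * reached through the SIOCDIFADDR ioctl. A minimal userland sketch of
 * issuing it; "em0" and 192.0.2.10 are hypothetical, the program needs the
 * PRIV_NET_DELIFADDR privilege (effectively root), and error handling is
 * kept minimal.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	struct sockaddr_in *sin;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* hypothetical NIC */
	sin = (struct sockaddr_in *)&ifr.ifr_addr;
	sin->sin_len = sizeof(*sin);
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.10", &sin->sin_addr);	/* alias to drop */

	if (ioctl(s, SIOCDIFADDR, &ifr) == -1)
		warn("SIOCDIFADDR");
	close(s);
	return (0);
}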
*/ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. 
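/*
 * Illustrative aside, not part of this patch: in_addprefix() above decides
 * whether a new address needs its own prefix route by masking both addresses
 * and comparing prefix and mask (plus the FIB). A standalone sketch of that
 * comparison with two hypothetical /24 aliases.
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Return 1 if (a1,m1) and (a2,m2) describe the same prefix route. */
static int
same_prefix(uint32_t a1, uint32_t m1, uint32_t a2, uint32_t m2)
{
	return (m1 == m2 && (a1 & m1) == (a2 & m2));
}

int
main(void)
{
	struct in_addr a, b;

	inet_pton(AF_INET, "192.0.2.10", &a);
	inet_pton(AF_INET, "192.0.2.200", &b);

	/* Both /24 aliases cover 192.0.2.0/24, so only one route is needed. */
	printf("same prefix: %d\n", same_prefix(ntohl(a.s_addr), 0xffffff00,
	    ntohl(b.s_addr), 0xffffff00));
	return (0);
}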
*/ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags +void +in_ifscrub_all(void) +{ + struct ifnet *ifp; + struct ifaddr *ifa, *nifa; + struct ifaliasreq ifr; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + /* Cannot lock here - lock recursion. */ + /* IF_ADDR_RLOCK(ifp); */ + TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + + /* + * This is ugly but the only way for legacy IP to + * cleanly remove addresses and everything attached. + */ + bzero(&ifr, sizeof(ifr)); + ifr.ifra_addr = *ifa->ifa_addr; + if (ifa->ifa_dstaddr) + ifr.ifra_broadaddr = *ifa->ifa_dstaddr; + (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, + ifp, NULL); + } + /* IF_ADDR_RUNLOCK(ifp); */ + in_purgemaddrs(ifp); + igmp_domifdetach(ifp); + } + IFNET_RUNLOCK(); +} + /* * Return 1 if the address might be a local broadcast address. 
*/ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && t == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void -in_ifdetach(struct ifnet *ifp) +in_ifdetach(struct ifnet *ifp, int purgeulp) { - in_pcbpurgeif0(&V_ripcbinfo, ifp); - in_pcbpurgeif0(&V_udbinfo, ifp); - in_pcbpurgeif0(&V_ulitecbinfo, ifp); + if (purgeulp) { + in_pcbpurgeif0(&V_ripcbinfo, ifp); + in_pcbpurgeif0(&V_udbinfo, ifp); + in_pcbpurgeif0(&V_ulitecbinfo, ifp); + } in_purgemaddrs(ifp); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in_multi) purgeinms; struct in_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN_MULTI_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { LIST_REMOVE(inm, inm_link); inm_release_locked(inm); } igmp_ifdetach(ifp); IN_MULTI_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. 
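/*
 * Illustrative aside, not part of this patch: a standalone restatement of the
 * in_broadcast() tests above -- limited broadcast, directed broadcast (subnet
 * with all host bits set), the obsolete host-0 form that RFC 3021 /31s opt
 * out of, and the all-ones-netmask escape hatch. Sample values are from
 * TEST-NET-1.
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Decide whether 'dst' (host order) is a broadcast address for an interface
 * whose subnet and mask (also host order) are given.
 */
static int
is_broadcast(uint32_t dst, uint32_t subnet, uint32_t mask)
{
	if (dst == 0xffffffff || dst == 0)	/* limited broadcast / INADDR_ANY */
		return (1);
	if (mask == 0xffffffff)			/* all-ones mask: no broadcast */
		return (0);
	if (dst == (subnet | ~mask))		/* directed broadcast */
		return (1);
	if (mask != 0xfffffffe && dst == subnet)	/* old host-0 form, not /31 */
		return (1);
	return (0);
}

int
main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "192.0.2.255", &a);
	printf("broadcast: %d\n",
	    is_broadcast(ntohl(a.s_addr), 0xc0000200, 0xffffff00));
	return (0);
}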
*/ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. 
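/*
 * Illustrative aside, not part of this patch: the ARP table above hashes each
 * IPv4 address into one of IN_LLTBL_DEFAULT_HSIZE buckets with
 * IN_LLTBL_HASH(), which simply folds the four address bytes together and
 * masks by the (power of two) table size. The same computation, standalone:
 */
#include <stdint.h>
#include <stdio.h>

#define HSIZE	32	/* must be a power of two, like IN_LLTBL_DEFAULT_HSIZE */

/* Fold all four address bytes together and mask into a bucket index. */
static unsigned int
lltbl_hash(uint32_t k)
{
	return ((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & (HSIZE - 1);
}

int
main(void)
{
	/* 0xc0000201 is 192.0.2.1 in host byte order. */
	printf("bucket %u of %d\n", lltbl_hash(0xc0000201), HSIZE);
	return (0);
}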
*/ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) return (NULL); lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. 
*/ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: projects/vnet/sys/netinet/in.h =================================================================== --- projects/vnet/sys/netinet/in.h (revision 297511) +++ projects/vnet/sys/netinet/in.h (revision 297512) @@ -1,669 +1,669 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. 
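/*
 * Illustrative aside, not part of this patch: in_lltable_dump_entry() above
 * exports each ARP entry as an rt_msghdr followed by a sockaddr_in and a
 * sockaddr_dl. A rough userland sketch, in the spirit of arp(8), of reading
 * that dump back via the routing sysctl; the trailing sockaddr_dl and the
 * rtm_version check are glossed over, and the exact MIB is assumed from the
 * stock FreeBSD arp(8) usage.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int mib[6] = { CTL_NET, PF_ROUTE, 0, AF_INET, NET_RT_FLAGS, RTF_LLDATA };
	size_t needed;
	char *buf, *next;

	if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1)
		err(1, "sysctl estimate");
	if ((buf = malloc(needed)) == NULL)
		err(1, "malloc");
	if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1)
		err(1, "sysctl");

	for (next = buf; next < buf + needed; ) {
		struct rt_msghdr *rtm = (struct rt_msghdr *)next;
		struct sockaddr_in *sin = (struct sockaddr_in *)(rtm + 1);
		char ip[INET_ADDRSTRLEN];

		inet_ntop(AF_INET, &sin->sin_addr, ip, sizeof(ip));
		printf("%s%s\n", ip,
		    (rtm->rtm_flags & RTF_STATIC) ? " (static)" : "");
		next += rtm->rtm_msglen;
	}
	free(buf);
	return (0);
}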
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.h 8.3 (Berkeley) 1/3/94 * $FreeBSD$ */ #ifndef _NETINET_IN_H_ #define _NETINET_IN_H_ #include #include #include /* Protocols common to RFC 1700, POSIX, and X/Open. */ #define IPPROTO_IP 0 /* dummy for IP */ #define IPPROTO_ICMP 1 /* control message protocol */ #define IPPROTO_TCP 6 /* tcp */ #define IPPROTO_UDP 17 /* user datagram protocol */ #define INADDR_ANY ((in_addr_t)0x00000000) #define INADDR_BROADCAST ((in_addr_t)0xffffffff) /* must be masked */ #ifndef _UINT8_T_DECLARED typedef __uint8_t uint8_t; #define _UINT8_T_DECLARED #endif #ifndef _UINT16_T_DECLARED typedef __uint16_t uint16_t; #define _UINT16_T_DECLARED #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _IN_ADDR_T_DECLARED typedef uint32_t in_addr_t; #define _IN_ADDR_T_DECLARED #endif #ifndef _IN_PORT_T_DECLARED typedef uint16_t in_port_t; #define _IN_PORT_T_DECLARED #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif /* Internet address (a structure for historical reasons). */ #ifndef _STRUCT_IN_ADDR_DECLARED struct in_addr { in_addr_t s_addr; }; #define _STRUCT_IN_ADDR_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #include /* Socket address, internet style. 
*/ struct sockaddr_in { uint8_t sin_len; sa_family_t sin_family; in_port_t sin_port; struct in_addr sin_addr; char sin_zero[8]; }; #if !defined(_KERNEL) && __POSIX_VISIBLE >= 200112 #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS uint32_t htonl(uint32_t); uint16_t htons(uint16_t); uint32_t ntohl(uint32_t); uint16_t ntohs(uint16_t); __END_DECLS #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif #endif /* !_KERNEL && __POSIX_VISIBLE >= 200112 */ #if __POSIX_VISIBLE >= 200112 #define IPPROTO_IPV6 41 /* IP6 header */ #define IPPROTO_RAW 255 /* raw IP packet */ #define INET_ADDRSTRLEN 16 #endif #if __BSD_VISIBLE /* * Constants and structures defined by the internet system, * Per RFC 790, September 1981, and numerous additions. */ /* * Protocols (RFC 1700) */ #define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ #define IPPROTO_IGMP 2 /* group mgmt protocol */ #define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ #define IPPROTO_IPV4 4 /* IPv4 encapsulation */ #define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ #define IPPROTO_ST 7 /* Stream protocol II */ #define IPPROTO_EGP 8 /* exterior gateway protocol */ #define IPPROTO_PIGP 9 /* private interior gateway */ #define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ #define IPPROTO_NVPII 11 /* network voice protocol*/ #define IPPROTO_PUP 12 /* pup */ #define IPPROTO_ARGUS 13 /* Argus */ #define IPPROTO_EMCON 14 /* EMCON */ #define IPPROTO_XNET 15 /* Cross Net Debugger */ #define IPPROTO_CHAOS 16 /* Chaos*/ #define IPPROTO_MUX 18 /* Multiplexing */ #define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ #define IPPROTO_HMP 20 /* Host Monitoring */ #define IPPROTO_PRM 21 /* Packet Radio Measurement */ #define IPPROTO_IDP 22 /* xns idp */ #define IPPROTO_TRUNK1 23 /* Trunk-1 */ #define IPPROTO_TRUNK2 24 /* Trunk-2 */ #define IPPROTO_LEAF1 25 /* Leaf-1 */ #define IPPROTO_LEAF2 26 /* Leaf-2 */ #define IPPROTO_RDP 27 /* Reliable Data */ #define IPPROTO_IRTP 28 /* Reliable Transaction */ #define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ #define IPPROTO_BLT 30 /* Bulk Data Transfer */ #define IPPROTO_NSP 31 /* Network Services */ #define IPPROTO_INP 32 /* Merit Internodal */ #define IPPROTO_SEP 33 /* Sequential Exchange */ #define IPPROTO_3PC 34 /* Third Party Connect */ #define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ #define IPPROTO_XTP 36 /* XTP */ #define IPPROTO_DDP 37 /* Datagram Delivery */ #define IPPROTO_CMTP 38 /* Control Message Transport */ #define IPPROTO_TPXX 39 /* TP++ Transport */ #define IPPROTO_IL 40 /* IL transport protocol */ #define IPPROTO_SDRP 42 /* Source Demand Routing */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ #define IPPROTO_IDRP 45 /* InterDomain Routing*/ #define IPPROTO_RSVP 46 /* resource reservation */ #define IPPROTO_GRE 47 /* General Routing Encap. */ #define IPPROTO_MHRP 48 /* Mobile Host Routing */ #define IPPROTO_BHA 49 /* BHA */ #define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ #define IPPROTO_AH 51 /* IP6 Auth Header */ #define IPPROTO_INLSP 52 /* Integ. 
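/*
 * Illustrative aside, not part of this patch: struct sockaddr_in above is the
 * address form every API in this file traffics in; its port and address are
 * stored in network byte order, hence the htons()/htonl() family declared
 * just below it. A minimal sketch of filling one in; the address and port
 * are arbitrary.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);		/* BSD-specific length field */
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);		/* network byte order */
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);

	printf("port %u addr 0x%08x\n", ntohs(sin.sin_port),
	    (unsigned)ntohl(sin.sin_addr.s_addr));
	return (0);
}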
Net Layer Security */ #define IPPROTO_SWIPE 53 /* IP with encryption */ #define IPPROTO_NHRP 54 /* Next Hop Resolution */ #define IPPROTO_MOBILE 55 /* IP Mobility */ #define IPPROTO_TLSP 56 /* Transport Layer Security */ #define IPPROTO_SKIP 57 /* SKIP */ #define IPPROTO_ICMPV6 58 /* ICMP6 */ #define IPPROTO_NONE 59 /* IP6 no next header */ #define IPPROTO_DSTOPTS 60 /* IP6 destination option */ #define IPPROTO_AHIP 61 /* any host internal protocol */ #define IPPROTO_CFTP 62 /* CFTP */ #define IPPROTO_HELLO 63 /* "hello" routing protocol */ #define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ #define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ #define IPPROTO_RVD 66 /* Remote Virtual Disk */ #define IPPROTO_IPPC 67 /* Pluribus Packet Core */ #define IPPROTO_ADFS 68 /* Any distributed FS */ #define IPPROTO_SATMON 69 /* Satnet Monitoring */ #define IPPROTO_VISA 70 /* VISA Protocol */ #define IPPROTO_IPCV 71 /* Packet Core Utility */ #define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ #define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */ #define IPPROTO_WSN 74 /* Wang Span Network */ #define IPPROTO_PVP 75 /* Packet Video Protocol */ #define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ #define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ #define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ #define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ #define IPPROTO_EON 80 /* ISO cnlp */ #define IPPROTO_VMTP 81 /* VMTP */ #define IPPROTO_SVMTP 82 /* Secure VMTP */ #define IPPROTO_VINES 83 /* Banyon VINES */ #define IPPROTO_TTP 84 /* TTP */ #define IPPROTO_IGP 85 /* NSFNET-IGP */ #define IPPROTO_DGP 86 /* dissimilar gateway prot. */ #define IPPROTO_TCF 87 /* TCF */ #define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_SRPC 90 /* Strite RPC protocol */ #define IPPROTO_LARP 91 /* Locus Address Resoloution */ #define IPPROTO_MTP 92 /* Multicast Transport */ #define IPPROTO_AX25 93 /* AX.25 Frames */ #define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ #define IPPROTO_MICP 95 /* Mobile Int.ing control */ #define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ #define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_SCTP 132 /* SCTP */ #define IPPROTO_MH 135 /* IPv6 Mobility Header */ #define IPPROTO_UDPLITE 136 /* UDP-Lite */ #define IPPROTO_HIP 139 /* IP6 Host Identity Protocol */ #define IPPROTO_SHIM6 140 /* IP6 Shim6 Protocol */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_MPLS 137 /* MPLS-in-IP */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ #define IPPROTO_RESERVED_253 253 /* Reserved */ #define IPPROTO_RESERVED_254 254 /* Reserved */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion, no longer used */ #define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */ #define IPPROTO_MAX 256 /* last return value of *_input(), meaning "all job for this pkt is done". */ #define IPPROTO_DONE 257 /* Only used internally, so can be outside the range of valid IP protocols. */ #define IPPROTO_DIVERT 258 /* divert pseudo-protocol */ #define IPPROTO_SEND 259 /* SeND pseudo-protocol */ /* * Defined to avoid confusion. The master value is defined by * PROTO_SPACER in sys/protosw.h. 
*/ #define IPPROTO_SPACER 32767 /* spacer for loadable protos */ /* * Local port number conventions: * * When a user does a bind(2) or connect(2) with a port number of zero, * a non-conflicting local port address is chosen. * The default range is IPPORT_HIFIRSTAUTO through * IPPORT_HILASTAUTO, although that is settable by sysctl. * * A user may set the IPPROTO_IP option IP_PORTRANGE to change this * default assignment range. * * The value IP_PORTRANGE_DEFAULT causes the default behavior. * * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers * into the "high" range. These are reserved for client outbound connections * which do not want to be filtered by any firewalls. * * The value IP_PORTRANGE_LOW changes the range to the "low" are * that is (by convention) restricted to privileged processes. This * convention is based on "vouchsafe" principles only. It is only secure * if you trust the remote host to restrict these ports. * * The default range of ports and the high range can be changed by * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto) * * Changing those values has bad security implications if you are * using a stateless firewall that is allowing packets outside of that * range in order to allow transparent outgoing connections. * * Such a firewall configuration will generally depend on the use of these * default values. If you change them, you may find your Security * Administrator looking for you with a heavy object. * * For a slightly more orthodox text view on this: * * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers * * port numbers are divided into three ranges: * * 0 - 1023 Well Known Ports * 1024 - 49151 Registered Ports * 49152 - 65535 Dynamic and/or Private Ports * */ /* * Ports < IPPORT_RESERVED are reserved for * privileged processes (e.g. root). (IP_PORTRANGE_LOW) */ #define IPPORT_RESERVED 1024 /* * Default local port range, used by IP_PORTRANGE_DEFAULT */ #define IPPORT_EPHEMERALFIRST 10000 #define IPPORT_EPHEMERALLAST 65535 /* * Dynamic port range, used by IP_PORTRANGE_HIGH. */ #define IPPORT_HIFIRSTAUTO 49152 #define IPPORT_HILASTAUTO 65535 /* * Scanning for a free reserved port return a value below IPPORT_RESERVED, * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was * 512, but that conflicts with some well-known-services that firewalls may * have a fit if we use. */ #define IPPORT_RESERVEDSTART 600 #define IPPORT_MAX 65535 /* * Definitions of bits in internet address integers. * On subnets, the decomposition of addresses to host and net parts * is done according to subnet mask, not the masks here. */ #define IN_CLASSA(i) (((in_addr_t)(i) & 0x80000000) == 0) #define IN_CLASSA_NET 0xff000000 #define IN_CLASSA_NSHIFT 24 #define IN_CLASSA_HOST 0x00ffffff #define IN_CLASSA_MAX 128 #define IN_CLASSB(i) (((in_addr_t)(i) & 0xc0000000) == 0x80000000) #define IN_CLASSB_NET 0xffff0000 #define IN_CLASSB_NSHIFT 16 #define IN_CLASSB_HOST 0x0000ffff #define IN_CLASSB_MAX 65536 #define IN_CLASSC(i) (((in_addr_t)(i) & 0xe0000000) == 0xc0000000) #define IN_CLASSC_NET 0xffffff00 #define IN_CLASSC_NSHIFT 8 #define IN_CLASSC_HOST 0x000000ff #define IN_CLASSD(i) (((in_addr_t)(i) & 0xf0000000) == 0xe0000000) #define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ #define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ #define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. 
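/*
 * Illustrative aside, not part of this patch: the port-range discussion above
 * is driven from userland through the IP_PORTRANGE socket option. A sketch
 * that asks for the "high" (IPPORT_HIFIRSTAUTO..IPPORT_HILASTAUTO) pool and
 * lets the kernel pick an ephemeral port; whether the bind succeeds naturally
 * depends on the running system.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);
	int s, range = IP_PORTRANGE_HIGH;	/* choose from the 49152-65535 pool */

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1)
		err(1, "socket");
	if (setsockopt(s, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range)) == -1)
		err(1, "setsockopt");

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = 0;			/* let the kernel choose */
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "bind");
	if (getsockname(s, (struct sockaddr *)&sin, &len) == -1)
		err(1, "getsockname");

	printf("kernel chose port %u\n", ntohs(sin.sin_port));
	close(s);
	return (0);
}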
*/ #define IN_MULTICAST(i) IN_CLASSD(i) #define IN_EXPERIMENTAL(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) #define IN_BADCLASS(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) #define IN_LINKLOCAL(i) (((in_addr_t)(i) & 0xffff0000) == 0xa9fe0000) #define IN_LOOPBACK(i) (((in_addr_t)(i) & 0xff000000) == 0x7f000000) #define IN_ZERONET(i) (((in_addr_t)(i) & 0xff000000) == 0) #define IN_PRIVATE(i) ((((in_addr_t)(i) & 0xff000000) == 0x0a000000) || \ (((in_addr_t)(i) & 0xfff00000) == 0xac100000) || \ (((in_addr_t)(i) & 0xffff0000) == 0xc0a80000)) #define IN_LOCAL_GROUP(i) (((in_addr_t)(i) & 0xffffff00) == 0xe0000000) #define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i)) #define INADDR_LOOPBACK ((in_addr_t)0x7f000001) #ifndef _KERNEL #define INADDR_NONE ((in_addr_t)0xffffffff) /* -1 return */ #endif #define INADDR_UNSPEC_GROUP ((in_addr_t)0xe0000000) /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP ((in_addr_t)0xe0000001) /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP ((in_addr_t)0xe0000002) /* 224.0.0.2 */ #define INADDR_ALLRPTS_GROUP ((in_addr_t)0xe0000016) /* 224.0.0.22, IGMPv3 */ #define INADDR_CARP_GROUP ((in_addr_t)0xe0000012) /* 224.0.0.18 */ #define INADDR_PFSYNC_GROUP ((in_addr_t)0xe00000f0) /* 224.0.0.240 */ #define INADDR_ALLMDNS_GROUP ((in_addr_t)0xe00000fb) /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP ((in_addr_t)0xe00000ff) /* 224.0.0.255 */ #define IN_LOOPBACKNET 127 /* official! */ #define IN_RFC3021_MASK ((in_addr_t)0xfffffffe) /* * Options for use with [gs]etsockopt at the IP level. * First word of comment is data type; bool is stored in int. */ #define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ #define IP_HDRINCL 2 /* int; header is included with data */ #define IP_TOS 3 /* int; IP type of service and preced. */ #define IP_TTL 4 /* int; IP time to live */ #define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ #define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ #define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ #define IP_SENDSRCADDR IP_RECVDSTADDR /* cmsg_type to set src addr */ #define IP_RETOPTS 8 /* ip_opts; set/get IP options */ #define IP_MULTICAST_IF 9 /* struct in_addr *or* struct ip_mreqn; * set/get IP multicast i/f */ #define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ #define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ #define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ #define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ #define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ #define IP_RSVP_ON 15 /* enable RSVP in kernel */ #define IP_RSVP_OFF 16 /* disable RSVP in kernel */ #define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ #define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ #define IP_PORTRANGE 19 /* int; range to choose for unspec port */ #define IP_RECVIF 20 /* bool; receive reception if w/dgram */ /* for IPSEC */ #define IP_IPSEC_POLICY 21 /* int; set/get security policy */ /* unused; was IP_FAITH */ #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_BINDANY 24 /* bool: allow bind to any address */ #define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */ #define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */ /* * Options for controlling the firewall and dummynet. * Historical options (from 40 to 64) will eventually be * replaced by only two options, IP_FW3 and IP_DUMMYNET3. 
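/*
 * Illustrative aside, not part of this patch: the IN_* classification macros
 * above can be used directly from userland (they are BSD extensions guarded
 * by __BSD_VISIBLE) as long as the address is first converted to host byte
 * order. The sample addresses are arbitrary.
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	const char *tests[] = { "10.1.2.3", "169.254.7.7", "224.0.0.251", "203.0.113.9" };
	struct in_addr a;

	for (size_t n = 0; n < sizeof(tests) / sizeof(tests[0]); n++) {
		in_addr_t i;

		inet_pton(AF_INET, tests[n], &a);
		i = ntohl(a.s_addr);	/* the IN_* macros expect host order */
		printf("%-14s private=%d linklocal=%d multicast=%d loopback=%d\n",
		    tests[n], IN_PRIVATE(i) != 0, IN_LINKLOCAL(i) != 0,
		    IN_MULTICAST(i) != 0, IN_LOOPBACK(i) != 0);
	}
	return (0);
}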
*/ #define IP_FW_TABLE_ADD 40 /* add entry */ #define IP_FW_TABLE_DEL 41 /* delete entry */ #define IP_FW_TABLE_FLUSH 42 /* flush table */ #define IP_FW_TABLE_GETSIZE 43 /* get table size */ #define IP_FW_TABLE_LIST 44 /* list table contents */ #define IP_FW3 48 /* generic ipfw v.3 sockopts */ #define IP_DUMMYNET3 49 /* generic dummynet v.3 sockopts */ #define IP_FW_ADD 50 /* add a firewall rule to chain */ #define IP_FW_DEL 51 /* delete a firewall rule from chain */ #define IP_FW_FLUSH 52 /* flush firewall rule chain */ #define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ #define IP_FW_GET 54 /* get entire firewall rule chain */ #define IP_FW_RESETLOG 55 /* reset logging counters */ #define IP_FW_NAT_CFG 56 /* add/config a nat rule */ #define IP_FW_NAT_DEL 57 /* delete a nat rule */ #define IP_FW_NAT_GET_CONFIG 58 /* get configuration of a nat rule */ #define IP_FW_NAT_GET_LOG 59 /* get log of a nat rule */ #define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ #define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ #define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ #define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ #define IP_RECVTTL 65 /* bool; receive IP TTL w/dgram */ #define IP_MINTTL 66 /* minimum TTL for packet or drop */ #define IP_DONTFRAG 67 /* don't fragment packet */ #define IP_RECVTOS 68 /* bool; receive IP TOS w/dgram */ /* IPv4 Source Filter Multicast API [RFC3678] */ #define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */ #define IP_DROP_SOURCE_MEMBERSHIP 71 /* drop a single source */ #define IP_BLOCK_SOURCE 72 /* block a source */ #define IP_UNBLOCK_SOURCE 73 /* unblock a source */ /* The following option is private; do not use it from user applications. */ #define IP_MSFILTER 74 /* set/get filter list */ /* Protocol Independent Multicast API [RFC3678] */ #define MCAST_JOIN_GROUP 80 /* join an any-source group */ #define MCAST_LEAVE_GROUP 81 /* leave all sources for group */ #define MCAST_JOIN_SOURCE_GROUP 82 /* join a source-specific group */ #define MCAST_LEAVE_SOURCE_GROUP 83 /* leave a single source */ #define MCAST_BLOCK_SOURCE 84 /* block a source */ #define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ /* Flow and RSS definitions */ #define IP_FLOWID 90 /* get flow id for the given socket/inp */ #define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */ #define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */ #define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */ #define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */ /* * Defaults and limits for options */ #define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ #define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ /* * The imo_membership vector for each socket is now dynamically allocated at * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized * according to a power-of-two increment. */ #define IP_MIN_MEMBERSHIPS 31 #define IP_MAX_MEMBERSHIPS 4095 #define IP_MAX_SOURCE_FILTER 1024 /* XXX to be unused */ /* * Default resource limits for IPv4 multicast source filtering. * These may be modified by sysctl. */ #define IP_MAX_GROUP_SRC_FILTER 512 /* sources per group */ #define IP_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ #define IP_MAX_SOCK_MUTE_FILTER 128 /* XXX no longer used */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. 
*/ struct ip_mreq { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_interface; /* local IP address of interface */ }; /* * Modified argument structure for IP_MULTICAST_IF, obtained from Linux. * This is used to specify an interface index for multicast sends, as * the IPv4 legacy APIs do not support this (unless IP_SENDIF is available). */ struct ip_mreqn { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_address; /* local IP address of interface */ int imr_ifindex; /* Interface index; cast to uint32_t */ }; /* * Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678] */ struct ip_mreq_source { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_sourceaddr; /* IP address of source */ struct in_addr imr_interface; /* local IP address of interface */ }; /* * Argument structures for Protocol-Independent Multicast Source * Filter APIs. [RFC3678] */ struct group_req { uint32_t gr_interface; /* interface index */ struct sockaddr_storage gr_group; /* group address */ }; struct group_source_req { uint32_t gsr_interface; /* interface index */ struct sockaddr_storage gsr_group; /* group address */ struct sockaddr_storage gsr_source; /* source address */ }; #ifndef __MSFILTERREQ_DEFINED #define __MSFILTERREQ_DEFINED /* * The following structure is private; do not use it from user applications. * It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between * the RFC 3678 libc functions and the kernel. */ struct __msfilterreq { uint32_t msfr_ifindex; /* interface index */ uint32_t msfr_fmode; /* filter mode for group */ uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ struct sockaddr_storage msfr_group; /* group address */ struct sockaddr_storage *msfr_srcs; /* pointer to the first member * of a contiguous array of * sources to filter in full. */ }; #endif struct sockaddr; /* * Advanced (Full-state) APIs [RFC3678] * The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter(). * We use uint32_t here to be consistent. */ int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t, uint32_t, struct in_addr *); int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *, uint32_t *, struct in_addr *); int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, uint32_t, uint32_t, struct sockaddr_storage *); int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, uint32_t *, uint32_t *, struct sockaddr_storage *); /* * Filter modes; also used to represent per-socket filter mode internally. 
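/*
 * Illustrative aside, not part of this patch: struct ip_mreq above is the
 * classic argument for IP_ADD_MEMBERSHIP. A minimal sketch of joining a
 * group, which is what ultimately allocates an in_multi in the kernel; the
 * group address (mDNS, 224.0.0.251) is only an example and the interface is
 * left for the kernel to choose.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ip_mreq mreq;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");

	memset(&mreq, 0, sizeof(mreq));
	inet_pton(AF_INET, "224.0.0.251", &mreq.imr_multiaddr);
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);	/* kernel picks the interface */

	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) == -1)
		err(1, "IP_ADD_MEMBERSHIP");
	printf("group joined\n");
	close(s);
	return (0);
}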
*/ #define MCAST_UNDEFINED 0 /* fmode: not yet defined */ #define MCAST_INCLUDE 1 /* fmode: include these source(s) */ #define MCAST_EXCLUDE 2 /* fmode: exclude these source(s) */ /* * Argument for IP_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() */ #define IP_PORTRANGE_DEFAULT 0 /* default range */ #define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ /* * Identifiers for IP sysctl nodes */ #define IPCTL_FORWARDING 1 /* act as router */ #define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ #define IPCTL_DEFTTL 3 /* default TTL */ #ifdef notyet #define IPCTL_DEFMTU 4 /* default MTU */ #endif /* IPCTL_RTEXPIRE 5 deprecated */ /* IPCTL_RTMINEXPIRE 6 deprecated */ /* IPCTL_RTMAXCACHE 7 deprecated */ #define IPCTL_SOURCEROUTE 8 /* may perform source routes */ #define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ #define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ #define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ #define IPCTL_STATS 12 /* ipstat structure */ #define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ #define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ /* 15, unused, was: IPCTL_KEEPFAITH */ #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ #endif /* __BSD_VISIBLE */ #ifdef _KERNEL struct ifnet; struct mbuf; /* forward declarations for Standard C */ int in_broadcast(struct in_addr, struct ifnet *); int in_canforward(struct in_addr); int in_localaddr(struct in_addr); int in_localip(struct in_addr); int in_ifhasaddr(struct ifnet *, struct in_addr); int inet_aton(const char *, struct in_addr *); /* in libkern */ char *inet_ntoa(struct in_addr); /* in libkern */ char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ char *inet_ntop(int, const void *, char *, socklen_t); /* in libkern */ int inet_pton(int af, const char *, void *); /* in libkern */ -void in_ifdetach(struct ifnet *); +void in_ifdetach(struct ifnet *, int); #define in_hosteq(s, t) ((s).s_addr == (t).s_addr) #define in_nullhost(x) ((x).s_addr == INADDR_ANY) #define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP)) #define satosin(sa) ((struct sockaddr_in *)(sa)) #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) #endif /* _KERNEL */ /* INET6 stuff */ #if __POSIX_VISIBLE >= 200112 #define __KAME_NETINET_IN_H_INCLUDED_ #include #undef __KAME_NETINET_IN_H_INCLUDED_ #endif #endif /* !_NETINET_IN_H_*/ Index: projects/vnet/sys/netinet/in_var.h =================================================================== --- projects/vnet/sys/netinet/in_var.h (revision 297511) +++ projects/vnet/sys/netinet/in_var.h (revision 297512) @@ -1,396 +1,397 @@ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. 
*/ TAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) extern struct rmlock in_ifaddr_lock; #define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) #define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) #define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) #define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) #define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ /* struct rm_priotracker *t; */ \ do { \ IN_IFADDR_RLOCK((t)); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. */ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. 
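/*
 * Illustrative aside, not part of this patch: INADDR_HASH() above buckets
 * configured addresses by an FNV-1 hash of the raw in_addr bytes, masked by
 * the (power of two) table size. A standalone sketch of the same shape; the
 * standard FNV-1 constants are used here and may differ from the kernel's
 * fnv_32_buf() initial basis, so treat the exact bucket numbers as
 * illustrative only.
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NHASH	(1 << 9)	/* like INADDR_NHASH: a power of two */

static uint32_t
inaddr_bucket(struct in_addr a)
{
	uint32_t hash = 2166136261u;	/* standard FNV-1 32-bit offset basis */
	unsigned char b[sizeof(a)];

	memcpy(b, &a, sizeof(b));
	for (size_t i = 0; i < sizeof(b); i++) {
		hash *= 16777619u;	/* FNV 32-bit prime */
		hash ^= b[i];
	}
	return (hash & (NHASH - 1));
}

int
main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "192.0.2.1", &a);
	printf("bucket %u of %d\n", inaddr_bucket(a), NHASH);
	return (0);
}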
*/ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ }; /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. 
In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_mtx; #define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) #define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) #define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) #define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); ++inm->inm_refcount; } /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); void inm_release(struct in_multi *); void inm_release_locked(struct in_multi *); struct in_multi * in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); +void in_ifscrub_all(void); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: projects/vnet/sys/netinet/ip_input.c =================================================================== --- projects/vnet/sys/netinet/ip_input.c (revision 297511) +++ projects/vnet/sys/netinet/ip_input.c (revision 297512) @@ -1,1346 +1,1357 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. 
*/ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. 
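Callers normally derive that index from the field offset rather than hard-coding a number; the KMOD_IPSTAT_INC() wrapper in ip_var.h is built this way. A sketch, with ips_total as an arbitrary example field:

	kmod_ipstat_inc(offsetof(struct ipstat, ips_total) / sizeof(uint64_t));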
*/ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. 
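Because every slot is pre-loaded with the raw-IP entry, delivery needs no validity check later; the lookup in ip_input() and ip_direct_input() below reduces to a single indexed call:

	/* Unregistered protocols fall through to the raw IP handler. */
	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);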
*/ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { + struct ifnet *ifp; int error; if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } - /* Cleanup in_ifaddr hash table; should be empty. */ - hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); + /* Remove the IPv4 addresses from all interfaces. */ + in_ifscrub_all(); + + /* Make sure the IPv4 routes are gone as well. */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) + rt_flushifroutes_af(ifp, AF_INET); + IFNET_RUNLOCK(); + /* Destroy IP reassembly queue. */ ipreass_destroy(); + + /* Cleanup in_ifaddr hash table; should be empty. */ + hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. 
*/ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* Try to forward the packet, but if we fail continue */ #ifdef IPSEC /* For now we do not handle IPSEC in tryforward. */ if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) && (V_ipforwarding == 1)) if (ip_tryforward(m) == NULL) return; /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; #else if (V_ipforwarding == 1) if (ip_tryforward(m) == NULL) return; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); if (dchg != 0) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). 
*/ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. 
*/ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m, ip->ip_p) != 0) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. 
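A loadable protocol typically pairs pf_proto_register() with ipproto_register() at load time and reverses both on unload. A hedged sketch; IPPROTO_MYPROTO and my_protosw are hypothetical placeholders:

	static struct protosw my_protosw;	/* filled in by the module */
	int error;

	error = pf_proto_register(PF_INET, &my_protosw);
	if (error == 0)
		error = ipproto_register(IPPROTO_MYPROTO);
	/* ... and on unload ... */
	(void)ipproto_unregister(IPPROTO_MYPROTO);
	(void)pf_proto_unregister(PF_INET, IPPROTO_MYPROTO, SOCK_RAW);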
*/ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #ifdef IPSEC if (ip_ipsec_fwd(m) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #endif /* IPSEC */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); } else ia = NULL; #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. * In case of IPsec, Don't discard it just yet, but pass it to * ip_output in case of outgoing IPsec policy. */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); RO_RTFREE(&ro); return; } #endif /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. 
*/ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. 
*/ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? 
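Each branch of ip_savecontrol() mirrors a socket option the receiver enabled; for instance SO_TIMESTAMP comes back as an SCM_TIMESTAMP control message carrying the arrival time. A minimal userland sketch, assuming s is an already-bound UDP socket:

	int on = 1;
	char buf[2048], ctl[CMSG_SPACE(sizeof(struct timeval))];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
	    .msg_control = ctl, .msg_controllen = sizeof(ctl) };
	struct cmsghdr *cm;
	struct timeval tv;

	setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
	if (recvmsg(s, &msg, 0) >= 0)
		for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
		    cm = CMSG_NXTHDR(&msg, cm))
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_TIMESTAMP)
				memcpy(&tv, CMSG_DATA(cm), sizeof(tv));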
*/ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: projects/vnet/sys/netinet6/in6_ifattach.c =================================================================== --- projects/vnet/sys/netinet6/in6_ifattach.c (revision 297511) +++ projects/vnet/sys/netinet6/in6_ifattach.c (revision 297512) @@ -1,918 +1,927 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. 
*/ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_FDDI: case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. 
*/ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) { IF_ADDR_RUNLOCK(ifp); return -1; } if (!addr[0]) { IF_ADDR_RUNLOCK(ifp); return -1; } bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; int error; /* * configure link-local address. 
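For the common 6-byte Ethernet case the result is the familiar modified EUI-64: split the MAC, insert ff:fe in the middle, then EUI64_TO_IFID() flips the universal/local bit. A worked example with a hypothetical MAC:

	/*
	 * MAC 00:11:22:33:44:55
	 *   expand    -> 00:11:22:ff:fe:33:44:55
	 *   flip u/l  -> 02:11:22:ff:fe:33:44:55
	 * giving the autoconfigured link-local fe80::211:22ff:fe33:4455.
	 */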
*/ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if (nd6_prefix_lookup(&pr0) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. 
*/ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_addr in6; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. 
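in6_nigroup0() below hashes the first (lowercased) label of the hostname into the RFC 4620 node-information group; most callers pass NULL/-1 and let it read the jail's hostname itself. A usage sketch:

	struct in6_addr nigroup;

	if (in6_nigroup(ifp, NULL, -1, &nigroup) == 0) {
		/* nigroup is ff02::2:ffXX:XXXX, already scoped to ifp */
	}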
*/ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { struct ifaddr *ifa; in6 = in6addr_loopback; ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &in6); if (ifa == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } else ifa_free(ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { int error; ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { error = in6_ifattach_linklocal(ifp, altifp); #if 0 if (error) log(LOG_NOTICE, "in6_ifattach_linklocal: " "failed to add a link-local addr to %s\n", if_name(ifp)); #endif } else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. */ void -in6_ifdetach(struct ifnet *ifp) +in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* remove neighbor management table */ - nd6_purge(ifp); + /* + * Enabling the nd6_purge will panic on vmove for interfaces on VNET + * teardown as the IPv6 layer is cleaned up already and the locks + * are destroyed. + */ + if (purgeulp) + nd6_purge(ifp); /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } - in6_pcbpurgeif0(&V_udbinfo, ifp); - in6_pcbpurgeif0(&V_ulitecbinfo, ifp); - in6_pcbpurgeif0(&V_ripcbinfo, ifp); + if (purgeulp) { + in6_pcbpurgeif0(&V_udbinfo, ifp); + in6_pcbpurgeif0(&V_ulitecbinfo, ifp); + in6_pcbpurgeif0(&V_ripcbinfo, ifp); + } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * remove neighbor management table. we call it twice just to make * sure we nuke everything. maybe we need just one call. * XXX: since the first call did not release addresses, some prefixes * might remain. We should call nd6_purge() again to release the * prefixes after removing all addresses above. * (Or can we just delay calling nd6_purge until at this point?) */ - nd6_purge(ifp); + if (purgeulp) + nd6_purge(ifp); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. 
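The new purgeulp argument to in6_ifdetach() lets VNET teardown skip the neighbor-discovery and PCB purges that would otherwise touch locks the IPv6 layer has already destroyed. The call sites live outside this hunk, but the intent implied by the code is roughly:

	in6_ifdetach(ifp, 1);	/* normal detach: addresses, ND and PCBs */
	in6_ifdetach(ifp, 0);	/* vmove/VNET teardown: skip nd6 and PCB purge */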
*/ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in6_multi) purgeinms; struct in6_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN6_MULTI_LOCK(); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, in6m_entry); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, in6m_entry, tinm) { LIST_REMOVE(inm, in6m_entry); in6m_release_locked(inm); } mld_ifdetach(ifp); IN6_MULTI_UNLOCK(); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); Index: projects/vnet/sys/netinet6/in6_ifattach.h =================================================================== --- projects/vnet/sys/netinet6/in6_ifattach.h (revision 297511) +++ projects/vnet/sys/netinet6/in6_ifattach.h (revision 297512) @@ -1,47 +1,47 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.h,v 1.14 2001/02/08 12:48:39 jinmei Exp $ * $FreeBSD$ */ #ifndef _NETINET6_IN6_IFATTACH_H_ #define _NETINET6_IN6_IFATTACH_H_ #ifdef _KERNEL void in6_ifattach(struct ifnet *, struct ifnet *); void in6_ifattach_destroy(void); -void in6_ifdetach(struct ifnet *); +void in6_ifdetach(struct ifnet *, int); int in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); void in6_tmpaddrtimer(void *); int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); int in6_nigroup_oldmcprefix(struct ifnet *, const char *, int, struct in6_addr *); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_IFATTACH_H_ */ Index: projects/vnet/sys/netinet6/ip6_input.c =================================================================== --- projects/vnet/sys/netinet6/ip6_input.c (revision 297511) +++ projects/vnet/sys/netinet6/ip6_input.c (revision 297512) @@ -1,1705 +1,1729 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include +#include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. 
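 *
 * Illustrative sketch, not part of this change: after the loops below run,
 * every slot of ip6_protox[] holds an index into inet6sw[].  Slots for
 * protocols with a kernel handler point at that handler's protosw entry
 * (e.g. the TCP and UDP entries), while every other slot keeps the raw
 * IPv6 entry, so ip6_input() can always dispatch with
 *
 *	nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 *
 * and unhandled protocols fall through to raw sockets.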
*/ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); TAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. 
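	 *
	 * Illustrative usage sketch, not part of this change, with a
	 * hypothetical loadable protocol IPPROTO_FOO and protosw
	 * foo6_protosw: a module would typically call
	 *
	 *	pf_proto_register(PF_INET6, &foo6_protosw);
	 *	ip6proto_register(IPPROTO_FOO);
	 *
	 * on load, and on unload undo both in reverse order with
	 *
	 *	ip6proto_unregister(IPPROTO_FOO);
	 *	pf_proto_unregister(PF_INET6, IPPROTO_FOO, SOCK_RAW);
	 *
	 * which ends with the slot reset below.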
*/ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { + struct ifaddr *ifa, *nifa; + struct ifnet *ifp; int error; if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } - hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); + + /* Cleanup addresses. */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + /* Cannot lock here - lock recursion. */ + /* IF_ADDR_LOCK(ifp); */ + TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { + + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + in6_purgeaddr(ifa); + } + /* IF_ADDR_UNLOCK(ifp); */ + in6_ifdetach(ifp, 0); + mld_domifdetach(ifp); + /* Make sure any routes are gone as well. */ + rt_flushifroutes_af(ifp, AF_INET6); + } + IFNET_RUNLOCK(); + nd6_destroy(); in6_ifattach_destroy(); + + hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. 
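 *
 * Illustrative note, not part of this change: the producer of such a packet
 * (typically the fragment reassembly code) is expected to attach an m_tag of
 * type MTAG_ABI_IPV6/IPV6_TAG_DIRECT carrying a struct ip6_direct_ctx with
 * the resolved next-header value and offset, and to requeue the mbuf on the
 * NETISR_IPV6_DIRECT netisr registered above; the code below recovers that
 * context and resumes the extension-header loop from it.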
*/ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ m->m_flags &= ~M_FASTFWD_OURS; ours = 1; ip6 = mtod(m, struct ip6_hdr *); goto hbhcheck; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (m->m_pkthdr.rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC( ip6s_m2m[m->m_pkthdr.rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, m->m_pkthdr.rcvif, NULL, ip6); /* * Check against address spoofing/corruption. 
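	 *
	 * For instance (illustrative, not part of this change), the checks
	 * below drop a packet whose source is any multicast address
	 * (ff00::/8 must never appear as a source), whose destination is the
	 * unspecified address (::), whose destination is interface-local
	 * multicast (ff01::/16) without having arrived via loopback, or
	 * whose multicast destination carries the reserved scope value 0;
	 * IPv4-mapped addresses (::ffff:0:0/96) in either field are rejected
	 * a little further down.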
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip6_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } if ((m->m_flags & M_IP6_NEXTHOP) && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. 
Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). 
*/ if (ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { IP6STAT_INC(ip6s_cantforward); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. 
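 *
 * Illustrative layout note, not part of this change: past the fixed part of
 * the Hop-by-Hop header (next header, plus a length counted in 8-byte units
 * beyond the first 8 bytes), the area walked below is a sequence of TLVs
 *
 *	| type (1 byte) | data length (1 byte) | data ... |
 *
 * except for PAD1, which is the single byte 0x00.  A Router Alert option is
 * therefore 4 bytes in total (16-bit value) and a Jumbo Payload option
 * 6 bytes (32-bit length), which is why the length octets are checked
 * against IP6OPT_RTALERT_LEN - 2 and IP6OPT_JUMBO_LEN - 2 below.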
*/ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. 
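 *
 * As an illustrative summary, not part of this change: the required action
 * is encoded in the two high-order bits of the option type, extracted by
 * IP6OPT_TYPE() below:
 *
 *	00 (IP6OPT_TYPE_SKIP)      skip the option and continue parsing
 *	01 (IP6OPT_TYPE_DISCARD)   drop the packet silently
 *	10 (IP6OPT_TYPE_FORCEICMP) drop and always send an ICMPv6 Parameter
 *	                           Problem
 *	11 (IP6OPT_TYPE_ICMP)      drop and send the ICMPv6 error only if the
 *	                           destination was not multicast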
*/ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? 
m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. 
*/ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(in6p->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (in6p->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). 
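	 *
	 * Usage sketch from the application side (illustrative, not part of
	 * this change): a consumer would typically enable the report with
	 *
	 *	int on = 1;
	 *	setsockopt(s, IPPROTO_IPV6, IPV6_RECVPATHMTU, &on, sizeof(on));
	 *
	 * and then read the struct ip6_mtuinfo out of the IPV6_PATHMTU
	 * control message returned by recvmsg(), which is the ancillary data
	 * appended below via sbappendaddr().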
*/ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals to OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop `neater' mechanism to process extension headers. */ char * ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return (&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return (&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. 
*/ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; Index: projects/vnet/sys/netinet6/mld6.c =================================================================== --- projects/vnet/sys/netinet6/mld6.c (revision 297511) +++ projects/vnet/sys/netinet6/mld6.c (revision 297512) @@ -1,3320 +1,3316 @@ /*- * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifsoftc * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(void); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct mld_ifsoftc *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void 
mld_v2_process_group_timers(struct mld_ifsoftc *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0}; static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head); static VNET_DEFINE(int, interface_timers_running6); static VNET_DEFINE(int, state_change_timers_running6); static VNET_DEFINE(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. 
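 *
 * (Illustrative note, not part of this change: because of CTLFLAG_VNET the
 * value resolves through the current vnet, so e.g.
 *
 *	sysctl net.inet6.mld.gsrdelay=20
 *
 * run inside a vnet jail only changes that jail's instance of
 * V_mld_gsrdelay, and the handler still serializes the update under the
 * MLD lock.)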
*/ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, - ("%s: called when curvnet was not restored", __func__)); + ("%s: called when curvnet was not restored: cuvnet %p m ptr %p", + __func__, curvnet, m->m_pkthdr.PH_loc.ptr)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. 
*/ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifsoftc * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifsoftc *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; SLIST_INIT(&mli->mli_relinmhead); mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. 
* Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp) { struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); if (mli->mli_version == MLD_VERSION_2) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (inm->in6m_state == MLD_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } in6m_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } MLD_UNLOCK(); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); KASSERT(SLIST_EMPTY(&mli->mli_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(mli, M_MLD); return; } } -#ifdef INVARIANTS - panic("%s: mld_ifsoftc not found for ifp %p\n", __func__, ifp); -#endif } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. 
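 *
 * The conversion just below turns the query's Maximum Response Delay
 * into fast-timeout ticks.  As a worked example, assuming the stock
 * PR_FASTHZ of 5 and MLD_TIMER_SCALE of 1000, a Maximum Response
 * Delay of 10000 becomes (10000 * 5) / 1000 = 50 ticks, i.e. the full
 * 10 seconds; each reporting group then arms its timer with a random
 * value bounded by that via MLD_RANDOM_DELAY() in
 * mld_v1_update_group().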
*/ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; IF_ADDR_RLOCK(ifp); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that the query header has been pulled up to sizeof(mldv2_query). * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. 
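 *
 * Note on the Maximum Response Code decoded a few lines below: once
 * the field reaches 32768 it is in the RFC 3810 exponential form and
 * expands to (mantissa | 0x1000) << (exponent + 3).  For example,
 * 0x8000 (exponent 0, mantissa 0) decodes to 0x1000 << 3 = 32768,
 * while 0xFFFF (exponent 7, mantissa 0xFFF) decodes to
 * 0x1FFF << 10 = 8387584, the largest representable delay.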
*/ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { IF_ADDR_RUNLOCK(ifp); goto out_locked; } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); IF_ADDR_RUNLOCK(ifp); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. 
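 *
 * Concretely: if the pending general-query response (mli_v2_timer) is
 * already due in, say, 3 ticks and the delay chosen for this query
 * works out to 5 ticks, the general response will cover every group
 * before the group-specific one could even fire, so no extra response
 * is scheduled; only when the general response is absent or later do
 * we fall through to mld_v2_process_group_query() below.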
*/ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); IF_ADDR_RUNLOCK(ifp); } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Process a recieved MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, const int off) { struct mldv2_query *mld; int retval; uint16_t nsrc; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct mbuf *m; uint8_t *sp; int i, nrecorded; int soff; m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, (const struct in6_addr *)sp); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); if (soff >= m->m_len) { soff = soff - m->m_len; m = m->m_next; if (m == NULL) break; } } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. 
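 *
 * Report suppression in practice: if two hosts on a link listen to the
 * same multicast group and host A's MLDv1 report is heard first, host
 * B's matching group drops into the LAZY state further down and its
 * own pending report is cancelled, so the querier only ever sees one
 * report per group per query interval.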
*/ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LOCK(); MLD_LOCK(); IF_ADDR_RLOCK(ifp); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. 
* Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. */ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; IN6_MULTI_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. 
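 *
 * The deferred-release idiom used in the loop below, in sketch form:
 * a group's final reference cannot be dropped while if_multiaddrs is
 * being walked under IF_ADDR_RLOCK() (releasing it may tear down the
 * very ifma being iterated), so doomed groups are parked on
 * mli_relinmhead and only released after the lock is dropped:
 *
 *	IF_ADDR_RLOCK(ifp);
 *	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 *		...
 *		SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele);
 *	}
 *	IF_ADDR_RUNLOCK(ifp);
 *	SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) {
 *		SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
 *		in6m_release_locked(inm);
 *	}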
*/ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(mli, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(mli, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); /* * Free the in_multi reference(s) for * this lifecycle. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct mld_ifsoftc *mli, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct mld_ifsoftc *mli, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. 
*/ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. 
*/ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLd module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; IN6_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ). 
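 *
 * For example, with MLDv1 active on the link and assuming the stock
 * MLD_V1_MAX_RI of 10 (seconds) and PR_FASTHZ of 5, the jitter below
 * is odelay = MLD_RANDOM_DELAY(10 * 5), i.e. somewhere within a
 * 50-tick (10 second) window; a caller-supplied delay is only honoured
 * when it exceeds that jitter, since the timer is armed with
 * max(delay, odelay).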
*/ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) in6m_release_locked(inm); inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. 
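 *
 * Put differently: in the silent and MLDv1 cases above the report (if
 * any) has already been handed off, so in6m_commit() can promote the
 * t1 filter state to t0 immediately; in the MLDv2 case the commit is
 * instead performed from the fast-timeout path once the first copy of
 * the state-change record has actually been transmitted.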
*/ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. */ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
*/ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. 
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. 
*/ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
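 *
 * The m0srcs recomputed below is simply the space left under the link
 * MTU divided among 16-byte source addresses.  As a rough worked
 * example, assuming a 1500-byte MTU, roughly 56 bytes of IPv6 +
 * Router Alert + ICMPv6 report headers (MLD_MTUSPACE) and a 20-byte
 * group record header, each follow-up packet carries about
 * (1500 - 56 - 20) / 16 = 89 sources before yet another packet is
 * needed.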
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
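 *
 * A concrete example of the deltas this function encodes: a group
 * whose filter moves from EXCLUDE {A, B} at t0 to EXCLUDE {A, C} at t1
 * yields ALLOW_NEW {B} (B is no longer blocked) and BLOCK_OLD {C}
 * (C is newly blocked); a move from INCLUDE {A} to INCLUDE {A, B}
 * yields only ALLOW_NEW {B}.  A, B and C stand in for arbitrary IPv6
 * source addresses.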
*/ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
*/ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. 
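 *
 * Concretely (an illustration of the checks below, not part of this
 * change), a pending state-change packet is merged into the tail of
 * the output queue only while both limits still hold, roughly
 *
 *	mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs
 *	    <= MLD_V2_REPORT_MAXRECS
 * and
 *	mt->m_pkthdr.len + m_length(m, NULL)
 *	    <= if_mtu - MLD_MTUSPACE
 *
 * otherwise the pending packet is queued (or skipped, if the queue is
 * already full) as a report of its own.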
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&mli->mli_gq) != 0) goto send; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. 
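 *
 * In outline (a sketch with an illustrative field choice; the helper
 * functions in this file are authoritative):
 *
 *	save (at enqueue time):	   stash ifp->if_index in the mbuf
 *				   packet header, e.g. m_pkthdr.flowid;
 *	restore (here):		   read the index back out;
 *	scrub (before ip6_output): clear the stashed state.
 *
 * Carrying the index rather than the ifnet pointer keeps the detach
 * race cheap to handle: a stale index simply fails the
 * ifnet_byindex() lookup below and the packet is dropped.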
*/ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { + /* This can happen if we shutdown the network stack. */ CTR1(KTR_MLD, "%s: tearing down", __func__); - - KASSERT(LIST_EMPTY(&V_mli_head), - ("%s: mli list not empty; ifnets not detached?", __func__)); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: projects/vnet/sys/sys/kernel.h =================================================================== --- projects/vnet/sys/sys/kernel.h (revision 297511) +++ projects/vnet/sys/sys/kernel.h (revision 297512) @@ -1,369 +1,370 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernel.h 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #include #ifdef _KERNEL /* for intrhook below */ #include /* Global variables for the kernel. */ /* 1.1 */ extern char kernelname[MAXPATHLEN]; extern int tick; /* usec per tick (1000000 / hz) */ extern int hz; /* system clock's frequency */ extern int psratio; /* ratio: prof / stat */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ extern int profprocs; /* number of process's profiling */ extern volatile int ticks; #endif /* _KERNEL */ /* * Enumerated types for known system startup interfaces. * * Startup occurs in ascending numeric order; the list entries are * sorted prior to attempting startup to guarantee order. Items * of the same level are arbitrated for order based on the 'order' * element. * * These numbers are arbitrary and are chosen ONLY for ordering; the * enumeration values are explicit rather than implicit to provide * for binary compatibility with inserted elements. * * The SI_SUB_LAST value must have the highest lexical value. */ enum sysinit_sub_id { SI_SUB_DUMMY = 0x0000000, /* not executed; for linker*/ SI_SUB_DONE = 0x0000001, /* processed*/ SI_SUB_TUNABLES = 0x0700000, /* establish tunable values */ SI_SUB_COPYRIGHT = 0x0800001, /* first use of console*/ SI_SUB_VM = 0x1000000, /* virtual memory system init*/ SI_SUB_KMEM = 0x1800000, /* kernel memory*/ SI_SUB_HYPERVISOR = 0x1A40000, /* * Hypervisor detection and * virtualization support * setup. 
*/ SI_SUB_WITNESS = 0x1A80000, /* witness initialization */ SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_RACCT = 0x2110000, /* resource accounting */ SI_SUB_KDTRACE = 0x2140000, /* Kernel dtrace hooks */ SI_SUB_RANDOM = 0x2160000, /* random number generator */ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ SI_SUB_MAC_POLICY = 0x21C0000, /* TrustedBSD MAC policies */ SI_SUB_MAC_LATE = 0x21D0000, /* TrustedBSD MAC subsystem */ SI_SUB_VNET = 0x21E0000, /* vnet 0 */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_DDB_SERVICES = 0x2380000, /* capture, scripting, etc. */ SI_SUB_RUN_QUEUE = 0x2400000, /* set up run queue*/ SI_SUB_KTRACE = 0x2480000, /* ktrace */ SI_SUB_OPENSOLARIS = 0x2490000, /* OpenSolaris compatibility */ SI_SUB_AUDIT = 0x24C0000, /* audit */ SI_SUB_CREATE_INIT = 0x2500000, /* create init process*/ SI_SUB_SCHED_IDLE = 0x2600000, /* required idle procs */ SI_SUB_MBUF = 0x2700000, /* mbuf subsystem */ SI_SUB_INTR = 0x2800000, /* interrupt threads */ SI_SUB_SOFTINTR = 0x2800001, /* start soft interrupt thread */ SI_SUB_DEVFS = 0x2F00000, /* devfs ready for devices */ SI_SUB_INIT_IF = 0x3000000, /* prep for net interfaces */ SI_SUB_NETGRAPH = 0x3010000, /* Let Netgraph initialize */ SI_SUB_DTRACE = 0x3020000, /* DTrace subsystem */ SI_SUB_DTRACE_PROVIDER = 0x3048000, /* DTrace providers */ SI_SUB_DTRACE_ANON = 0x308C000, /* DTrace anon enabling */ SI_SUB_DRIVERS = 0x3100000, /* Let Drivers initialize */ SI_SUB_CONFIGURE = 0x3800000, /* Configure devices */ SI_SUB_VFS = 0x4000000, /* virtual filesystem*/ SI_SUB_CLOCKS = 0x4800000, /* real time and stat clocks*/ SI_SUB_SYSV_SHM = 0x6400000, /* System V shared memory*/ SI_SUB_SYSV_SEM = 0x6800000, /* System V semaphores*/ SI_SUB_SYSV_MSG = 0x6C00000, /* System V message queues*/ SI_SUB_P1003_1B = 0x6E00000, /* P1003.1B realtime */ SI_SUB_PSEUDO = 0x7000000, /* pseudo devices*/ SI_SUB_EXEC = 0x7400000, /* execve() handlers */ SI_SUB_PROTO_BEGIN = 0x8000000, /* VNET initialization */ SI_SUB_PROTO_PFIL = 0x8100000, /* Intialize pfil before FWs */ SI_SUB_PROTO_IF = 0x8400000, /* interfaces*/ SI_SUB_PROTO_DOMAININIT = 0x8600000, /* domain registration system */ SI_SUB_PROTO_MC = 0x8700000, /* Multicast */ SI_SUB_PROTO_DOMAIN = 0x8800000, /* domains (address families?)*/ SI_SUB_FW = 0x8806000, /* Firewalls */ SI_SUB_PROTO_IFATTACHDOMAIN = 0x8808000,/* domain dependent data init */ SI_SUB_PROTO_END = 0x8ffffff, /* VNET helper functions */ SI_SUB_KPROF = 0x9000000, /* kernel profiling*/ SI_SUB_KICK_SCHEDULER = 0xa000000, /* start the timeout events*/ SI_SUB_INT_CONFIG_HOOKS = 0xa800000, /* Interrupts enabled config */ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ + SI_SUB_PSEUDO_DONE = 0xdb00000, /* pseudo dev removal; last! 
*/ SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RACCTD = 0xf100000, /* start racctd*/ SI_SUB_LAST = 0xfffffff /* final initialization */ }; /* * Some enumerated orders; "ANY" sorts last. */ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_FOURTH = 0x0000003, /* fourth*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ }; /* * A system initialization call instance * * At the moment there is one instance of sysinit. We probably do not * want two which is why this code is if'd out, but we definitely want * to discern SYSINIT's which take non-constant data pointers and * SYSINIT's which take constant data pointers, * * The C_* macros take functions expecting const void * arguments * while the non-C_* macros take functions expecting just void * arguments. * * With -Wcast-qual on, the compiler issues warnings: * - if we pass non-const data or functions taking non-const data * to a C_* macro. * * - if we pass const data to the normal macros * * However, no warning is issued if we pass a function taking const data * through a normal non-const macro. This is ok because the function is * saying it won't modify the data so we don't care whether the data is * modifiable or not. */ typedef void (*sysinit_nfunc_t)(void *); typedef void (*sysinit_cfunc_t)(const void *); struct sysinit { enum sysinit_sub_id subsystem; /* subsystem identifier*/ enum sysinit_elem_order order; /* init order within subsystem*/ sysinit_cfunc_t func; /* function */ const void *udata; /* multiplexer/argument */ }; /* * Default: no special processing * * The C_ version of SYSINIT is for data pointers to const * data ( and functions taking data pointers to const data ). * At the moment it is no different from SYSINIT and thus * still results in warnings. * * The casts are necessary to have the compiler produce the * correct warnings when -Wcast-qual is used. * */ #define C_SYSINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_init = { \ subsystem, \ order, \ func, \ (ident) \ }; \ DATA_SET(sysinit_set,uniquifier ## _sys_init) #define SYSINIT(uniquifier, subsystem, order, func, ident) \ C_SYSINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)(ident)) /* * Called on module unload: no special processing */ #define C_SYSUNINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_uninit = { \ subsystem, \ order, \ func, \ (ident) \ }; \ DATA_SET(sysuninit_set,uniquifier ## _sys_uninit) #define SYSUNINIT(uniquifier, subsystem, order, func, ident) \ C_SYSUNINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)(ident)) void sysinit_add(struct sysinit **set, struct sysinit **set_end); /* * Infrastructure for tunable 'constants'. Value may be specified at compile * time or kernel load time. Rules relating tunables together can be placed * in a SYSINIT function at SI_SUB_TUNABLES with SI_ORDER_ANY. 
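 *
 * Illustrative usage (hypothetical names, shown only as a sketch);
 * the SYSINIT clamps the value after loader.conf(5) has been read:
 *
 *	static unsigned long foo_maxbufs = 128;
 *	TUNABLE_ULONG("kern.foo.maxbufs", &foo_maxbufs);
 *
 *	static void
 *	foo_tunable_fixup(void *arg __unused)
 *	{
 *		if (foo_maxbufs < 1)
 *			foo_maxbufs = 1;
 *	}
 *	SYSINIT(foo_fixup, SI_SUB_TUNABLES, SI_ORDER_ANY,
 *	    foo_tunable_fixup, NULL);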
* * WARNING: developers should never use the reserved suffixes specified in * loader.conf(5) for any tunables or conflicts will result. */ /* * int * please avoid using for new tunables! */ extern void tunable_int_init(void *); struct tunable_int { const char *path; int *var; }; #define TUNABLE_INT(path, var) \ static struct tunable_int __CONCAT(__tunable_int_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_int_init, \ &__CONCAT(__tunable_int_, __LINE__)) #define TUNABLE_INT_FETCH(path, var) getenv_int((path), (var)) /* * long */ extern void tunable_long_init(void *); struct tunable_long { const char *path; long *var; }; #define TUNABLE_LONG(path, var) \ static struct tunable_long __CONCAT(__tunable_long_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_long_init,\ &__CONCAT(__tunable_long_, __LINE__)) #define TUNABLE_LONG_FETCH(path, var) getenv_long((path), (var)) /* * unsigned long */ extern void tunable_ulong_init(void *); struct tunable_ulong { const char *path; unsigned long *var; }; #define TUNABLE_ULONG(path, var) \ static struct tunable_ulong __CONCAT(__tunable_ulong_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_ulong_init, \ &__CONCAT(__tunable_ulong_, __LINE__)) #define TUNABLE_ULONG_FETCH(path, var) getenv_ulong((path), (var)) /* * quad */ extern void tunable_quad_init(void *); struct tunable_quad { const char *path; quad_t *var; }; #define TUNABLE_QUAD(path, var) \ static struct tunable_quad __CONCAT(__tunable_quad_, __LINE__) = { \ (path), \ (var), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_quad_init, \ &__CONCAT(__tunable_quad_, __LINE__)) #define TUNABLE_QUAD_FETCH(path, var) getenv_quad((path), (var)) extern void tunable_str_init(void *); struct tunable_str { const char *path; char *var; int size; }; #define TUNABLE_STR(path, var, size) \ static struct tunable_str __CONCAT(__tunable_str_, __LINE__) = { \ (path), \ (var), \ (size), \ }; \ SYSINIT(__CONCAT(__Tunable_init_, __LINE__), \ SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_str_init, \ &__CONCAT(__tunable_str_, __LINE__)) #define TUNABLE_STR_FETCH(path, var, size) \ getenv_string((path), (var), (size)) struct intr_config_hook { TAILQ_ENTRY(intr_config_hook) ich_links; void (*ich_func)(void *arg); void *ich_arg; }; int config_intrhook_establish(struct intr_config_hook *hook); void config_intrhook_disestablish(struct intr_config_hook *hook); #endif /* !_SYS_KERNEL_H_*/
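/*
 * Illustrative use of the intr_config_hook interface above (a sketch
 * with hypothetical driver names, not part of this change): a driver
 * whose attach routine needs working interrupts defers the tail of
 * its initialization until interrupts are enabled, and removes the
 * hook once that deferred work has run.
 */
#if 0	/* example only */
static struct intr_config_hook foo_config_hook;

static void
foo_config_intrhook(void *arg)
{
	struct foo_softc *sc = arg;

	/* Finish initialization that needs interrupts to be enabled. */
	foo_finish_attach(sc);

	/* A hook must be disestablished once it has done its work. */
	config_intrhook_disestablish(&foo_config_hook);
}

static int
foo_attach(struct foo_softc *sc)
{

	foo_config_hook.ich_func = foo_config_intrhook;
	foo_config_hook.ich_arg = sc;
	if (config_intrhook_establish(&foo_config_hook) != 0)
		return (ENXIO);
	return (0);
}
#endif	/* example only */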