diff --git a/sys/net/altq/altq_cbq.c b/sys/net/altq/altq_cbq.c
index 0e267fcb49e2..86102ebcd537 100644
--- a/sys/net/altq/altq_cbq.c
+++ b/sys/net/altq/altq_cbq.c
@@ -1,563 +1,564 @@
 /*-
  * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by the SMCC Technology
  *      Development Group at Sun Microsystems, Inc.
  *
  * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or
  *      promote products derived from this software without specific prior
  *      written permission.
  *
  * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE
  * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE.  The software is
  * provided "as is" without express or implied warranty of any kind.
  *
  * These notices must be retained in any copies of any part of this software.
  *
  * $KAME: altq_cbq.c,v 1.19 2003/09/17 14:23:25 kjc Exp $
  * $FreeBSD$
  */
 
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <netpfil/pf/pf_mtag.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_cbq.h>
 
 /*
  * Forward Declarations.
  */
 static int		 cbq_class_destroy(cbq_state_t *, struct rm_class *);
 static struct rm_class  *clh_to_clp(cbq_state_t *, u_int32_t);
 static int		 cbq_clear_interface(cbq_state_t *);
 static int		 cbq_request(struct ifaltq *, int, void *);
 static int		 cbq_enqueue(struct ifaltq *, struct mbuf *,
 			     struct altq_pktattr *);
 static struct mbuf	*cbq_dequeue(struct ifaltq *, int);
 static void		 cbqrestart(struct ifaltq *);
 static void		 get_class_stats(class_stats_t *, struct rm_class *);
 static void		 cbq_purge(cbq_state_t *);
 
 /*
  * cbq_class_destroy(cbq_state_t *, struct rm_class *)
  *
  *	Tear down one traffic class: hand it to rmc_delete_class(), then
  *	forget every reference the cbq state keeps to it (class table
  *	slots, root and default class pointers).
  *
  *	Returns:	always 0.
  */
 static int
 cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl)
 {
 	int	slot;
 
 	rmc_delete_class(&cbqp->ifnp, cl);
 
 	/* release every table slot that still names the class */
 	for (slot = 0; slot < CBQ_MAX_CLASSES; slot++) {
 		if (cbqp->cbq_class_tbl[slot] != cl)
 			continue;
 		cbqp->cbq_class_tbl[slot] = NULL;
 	}
 
 	/* the class may also have been the root and/or default class */
 	if (cbqp->ifnp.root_ == cl)
 		cbqp->ifnp.root_ = NULL;
 	if (cbqp->ifnp.default_ == cl)
 		cbqp->ifnp.default_ = NULL;
 
 	return (0);
 }
 
 /*
  * Map a class handle to its class pointer.
  *
  * Handle 0 never names a class.  Returns NULL when no class in the
  * table carries the requested handle.
  */
 static struct rm_class *
 clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle)
 {
 	struct rm_class	*cand;
 	int		 slot;
 
 	if (chandle == 0)
 		return (NULL);
 
 	/* fast path: the slot selected by the low bits of the handle */
 	cand = cbqp->cbq_class_tbl[chandle % CBQ_MAX_CLASSES];
 	if (cand != NULL && cand->stats_.handle == chandle)
 		return (cand);
 
 	/* slow path: scan the whole table */
 	for (slot = 0; slot < CBQ_MAX_CLASSES; slot++) {
 		cand = cbqp->cbq_class_tbl[slot];
 		if (cand == NULL || cand->stats_.handle != chandle)
 			continue;
 		return (cand);
 	}
 
 	return (NULL);
 }
 
 /*
  * Destroy every class recorded in the class table.
  *
  * Classes that are still parents are skipped on a given pass; the table
  * is rescanned until a pass finds no parent class left.
  *
  * Returns:	always 0.
  */
 static int
 cbq_clear_interface(cbq_state_t *cbqp)
 {
 	struct rm_class	*cl;
 	int		 pending, slot;
 
 #ifdef ALTQ3_CLFIER_COMPAT
 	/* free the filters for this interface */
 	acc_discard_filters(&cbqp->cbq_classifier, NULL, 1);
 #endif
 
 	for (;;) {
 		pending = 0;
 		for (slot = 0; slot < CBQ_MAX_CLASSES; slot++) {
 			cl = cbqp->cbq_class_tbl[slot];
 			if (cl == NULL)
 				continue;
 			if (is_a_parent_class(cl)) {
 				/* still a parent: retry on a later pass */
 				pending++;
 				continue;
 			}
 			cbq_class_destroy(cbqp, cl);
 			cbqp->cbq_class_tbl[slot] = NULL;
 			if (cbqp->ifnp.root_ == cl)
 				cbqp->ifnp.root_ = NULL;
 			if (cbqp->ifnp.default_ == cl)
 				cbqp->ifnp.default_ = NULL;
 		}
 		if (pending == 0)
 			break;
 	}
 
 	return (0);
 }
 
 /*
  * Discipline request handler.  Only ALTRQ_PURGE is acted upon; any
  * other request is ignored.  The ifq lock must be held.
  *
  * Returns:	always 0.
  */
 static int
 cbq_request(struct ifaltq *ifq, int req, void *arg)
 {
 	cbq_state_t	*cbqp = (cbq_state_t *)ifq->altq_disc;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (req == ALTRQ_PURGE)
 		cbq_purge(cbqp);
 
 	return (0);
 }
 
 /*
  * Fill a class_stats_t snapshot from the state kept in an rm_class.
  * Fields of *statsp that are not listed here are left untouched.
  */
 static void
 get_class_stats(class_stats_t *statsp, struct rm_class *cl)
 {
 	/* class parameters */
 	statsp->depth		= cl->depth_;
 	statsp->priority	= cl->pri_;
 	statsp->maxidle		= cl->maxidle_;
 	statsp->minidle		= cl->minidle_;
 	statsp->offtime		= cl->offtime_;
 	statsp->ns_per_byte	= cl->ns_per_byte_;
 	statsp->wrr_allot	= cl->w_allotment_;
 	statsp->avgidle		= cl->avgidle_;
 
 	/* counters accumulated in the class */
 	statsp->xmit_cnt	= cl->stats_.xmit_cnt;
 	statsp->drop_cnt	= cl->stats_.drop_cnt;
 	statsp->over		= cl->stats_.over;
 	statsp->borrows		= cl->stats_.borrows;
 	statsp->overactions	= cl->stats_.overactions;
 	statsp->delays		= cl->stats_.delays;
 
 	/* state of the class queue */
 	statsp->qmax		= qlimit(cl->q_);
 	statsp->qcnt		= qlen(cl->q_);
 	statsp->qtype		= qtype(cl->q_);
 
 	/* statistics of the queue management algorithm, if any */
 #ifdef ALTQ_RED
 	if (q_is_red(cl->q_)) {
 		red_getstats(cl->red_, &statsp->red[0]);
 	}
 #endif
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->q_)) {
 		rio_getstats((rio_t *)cl->red_, &statsp->red[0]);
 	}
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->q_)) {
 		codel_getstats(cl->codel_, &statsp->codel);
 	}
 #endif
 }
 
 /*
  * Attach the cbq discipline state stored in a->altq_disc to the send
  * queue of the interface named a->ifname.
  *
  * Returns:	EINVAL if the interface is unknown or no state is set,
  *		otherwise the result of altq_attach().
  */
 int
 cbq_pfattach(struct pf_altq *a)
 {
 	struct ifnet	*ifp;
 	int		 error, s;
 
 	ifp = ifunit(a->ifname);
 	if (ifp == NULL || a->altq_disc == NULL)
 		return (EINVAL);
 
 	s = splnet();
 	error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc,
 	    cbq_enqueue, cbq_dequeue, cbq_request);
 	splx(s);
 
 	return (error);
 }
 
 /*
  * Allocate a zeroed cbq_state_t for the interface and store it in
  * a->altq_disc.
  *
  * Returns:	0 on success, EINVAL for a NULL ifp, ENODEV if the send
  *		queue is not ALTQ-ready, ENOMEM if allocation fails.
  */
 int
 cbq_add_altq(struct ifnet *ifp, struct pf_altq *a)
 {
 	cbq_state_t	*state;
 
 	if (ifp == NULL)
 		return (EINVAL);
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return (ENODEV);
 
 	state = malloc(sizeof(*state), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (state == NULL)
 		return (ENOMEM);
 
 	CALLOUT_INIT(&state->cbq_callout);
 	state->cbq_qlen = 0;
 	/* remember which send queue this state belongs to */
 	state->ifnp.ifq_ = &ifp->if_snd;
 
 	/* hand the state over to pf_altq */
 	a->altq_disc = state;
 
 	return (0);
 }
 
 /*
  * Detach the cbq state from a->altq_disc, destroy all of its classes
  * and free the state.
  *
  * Returns:	0 on success, EINVAL if no state is attached.
  */
 int
 cbq_remove_altq(struct pf_altq *a)
 {
 	cbq_state_t	*state;
 
 	state = a->altq_disc;
 	if (state == NULL)
 		return (EINVAL);
 	a->altq_disc = NULL;
 
 	cbq_clear_interface(state);
 
 	/* destroy whatever default/root class is still referenced */
 	if (state->ifnp.default_ != NULL)
 		cbq_class_destroy(state, state->ifnp.default_);
 	if (state->ifnp.root_ != NULL)
 		cbq_class_destroy(state, state->ifnp.root_);
 
 	free(state, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Create the cbq class described by a pf_altq queue definition and
  * record it in the class table of the state held in a->altq_disc.
  *
  * The class handle is a->qid, its parent is looked up from
  * a->parent_qid, and the class parameters come from a->pq_u.cbq_opts.
  * A root class is created through rmc_init(); every other class is
  * created through rmc_newclass().
  *
  * Returns:	0 on success,
  *		EINVAL for missing state, a zero qid, a full class table,
  *		an out-of-range priority, a missing parent on a non-root
  *		class, or an invalid/duplicate root or default class,
  *		ENOMEM if the class could not be created.
  */
 int
 cbq_add_queue(struct pf_altq *a)
 {
 	struct rm_class	*borrow, *parent;
 	cbq_state_t	*cbqp;
 	struct rm_class	*cl;
 	struct cbq_opts	*opts;
 	int		i;
 
 	if ((cbqp = a->altq_disc) == NULL)
 		return (EINVAL);
 	if (a->qid == 0)
 		return (EINVAL);
 
 	/*
 	 * find a free slot in the class table.  if the slot matching
 	 * the lower bits of qid is free, use this slot.  otherwise,
 	 * use the first free slot.
 	 */
 	i = a->qid % CBQ_MAX_CLASSES;
 	if (cbqp->cbq_class_tbl[i] != NULL) {
 		for (i = 0; i < CBQ_MAX_CLASSES; i++)
 			if (cbqp->cbq_class_tbl[i] == NULL)
 				break;
 		/* the table is full */
 		if (i == CBQ_MAX_CLASSES)
 			return (EINVAL);
 	}
 
 	opts = &a->pq_u.cbq_opts;
 	/* check parameters */
 	if (a->priority >= CBQ_MAXPRI)
 		return (EINVAL);
 
 	/* Get pointers to parent and borrow classes.  */
 	parent = clh_to_clp(cbqp, a->parent_qid);
 	if (opts->flags & CBQCLF_BORROW)
 		borrow = parent;
 	else
 		borrow = NULL;
 
 	/*
 	 * A class must borrow from its parent or it can not
 	 * borrow at all.  Hence, borrow can be null.
 	 */
 	if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) {
 		printf("cbq_add_queue: no parent class!\n");
 		return (EINVAL);
 	}
 
 	/*
 	 * borrow was set above to either parent or NULL, so this check
 	 * cannot fire as the code stands.
 	 */
 	if ((borrow != parent)  && (borrow != NULL)) {
 		printf("cbq_add_class: borrow class != parent\n");
 		return (EINVAL);
 	}
 
 	/*
 	 * check parameters
 	 */
 	switch (opts->flags & CBQCLF_CLASSMASK) {
 	case CBQCLF_ROOTCLASS:
 		/* a root class has no parent and may exist only once */
 		if (parent != NULL)
 			return (EINVAL);
 		if (cbqp->ifnp.root_)
 			return (EINVAL);
 		break;
 	case CBQCLF_DEFCLASS:
 		/* only one default class may exist */
 		if (cbqp->ifnp.default_)
 			return (EINVAL);
 		break;
 	case 0:
 		/* a zero qid was already rejected at the top */
 		if (a->qid == 0)
 			return (EINVAL);
 		break;
 	default:
 		/* more than two flags bits set */
 		return (EINVAL);
 	}
 
 	/*
 	 * create a class.  if this is a root class, initialize the
 	 * interface.
 	 */
 	if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) {
 		rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte,
 		    cbqrestart, a->qlimit, RM_MAXQUEUED,
 		    opts->maxidle, opts->minidle, opts->offtime,
 		    opts->flags);
 		/* the root class is read back from ifnp.root_ after rmc_init() */
 		cl = cbqp->ifnp.root_;
 	} else {
 		cl = rmc_newclass(a->priority,
 				  &cbqp->ifnp, opts->ns_per_byte,
 				  rmc_delay_action, a->qlimit, parent, borrow,
 				  opts->maxidle, opts->minidle, opts->offtime,
 				  opts->pktsize, opts->flags);
 	}
 	if (cl == NULL)
 		return (ENOMEM);
 
 	/* return handle to user space. */
 	cl->stats_.handle = a->qid;
 	cl->stats_.depth = cl->depth_;
 
 	/* save the allocated class */
 	cbqp->cbq_class_tbl[i] = cl;
 
 	if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS)
 		cbqp->ifnp.default_ = cl;
 
 	return (0);
 }
 
 /*
  * Delete the class whose handle is a->qid.
  *
  * Returns:	0 on success, EINVAL if no state is attached, the class
  *		does not exist, or the class is still a parent class.
  */
 int
 cbq_remove_queue(struct pf_altq *a)
 {
 	cbq_state_t	*cbqp;
 	struct rm_class	*cl;
 	int		 slot;
 
 	cbqp = a->altq_disc;
 	if (cbqp == NULL)
 		return (EINVAL);
 
 	cl = clh_to_clp(cbqp, a->qid);
 	if (cl == NULL)
 		return (EINVAL);
 
 	/* a class with children cannot be removed */
 	if (is_a_parent_class(cl))
 		return (EINVAL);
 
 	rmc_delete_class(&cbqp->ifnp, cl);
 
 	/* release the first table slot that names the class */
 	for (slot = 0; slot < CBQ_MAX_CLASSES; slot++) {
 		if (cbqp->cbq_class_tbl[slot] != cl)
 			continue;
 		cbqp->cbq_class_tbl[slot] = NULL;
 		if (cbqp->ifnp.root_ == cl)
 			cbqp->ifnp.root_ = NULL;
 		if (cbqp->ifnp.default_ == cl)
 			cbqp->ifnp.default_ = NULL;
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Copy the statistics of the class with handle a->qid, on the cbq
  * instance attached to interface a->ifname, out to the user buffer ubuf.
  *
  * *nbytes is the size of the user buffer on entry and is set to the
  * number of bytes copied on success.  The version argument is unused.
  *
  * Returns:	0 on success, EBADF if no cbq instance is attached to the
  *		interface, EINVAL if the class does not exist or the buffer
  *		is too small, or the error reported by copyout().
  */
 int
 cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	cbq_state_t	*cbqp;
 	struct rm_class	*cl;
 	class_stats_t	 stats;
 	int		 error = 0;
 
 	if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL)
 		return (EBADF);
 
 	if ((cl = clh_to_clp(cbqp, a->qid)) == NULL)
 		return (EINVAL);
 
 	/*
 	 * *nbytes is an int; test its sign first because a negative value
 	 * converted to size_t would compare as larger than sizeof(stats).
 	 */
 	if (*nbytes < 0 || (size_t)*nbytes < sizeof(stats))
 		return (EINVAL);
 
 	/*
 	 * get_class_stats() fills only some members (the red/codel parts
 	 * are conditional), so clear the structure first to avoid copying
 	 * uninitialized kernel stack bytes out to user space.
 	 */
 	memset(&stats, 0, sizeof(stats));
 	get_class_stats(&stats, cl);
 
 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
 		return (error);
 	*nbytes = sizeof(stats);
 	return (0);
 }
 
 /*
  * int
  * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr)
  *		- Queue a data packet.
  *
  *	The packet is queued on the class named by its pf mtag qid, or on
  *	the default class when the packet has no tag or the qid matches no
  *	class.  A packet without a packet header, or one for which no
  *	class can be found, is freed.
  *
  *	Assumptions:	the ifq lock is held (asserted).
  *	Returns:	0 if the queueing is successful.
  *			ENOBUFS if the packet was dropped.
  */
 
 static int
 cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
 	cbq_state_t	*cbqp = (cbq_state_t *)ifq->altq_disc;
 	struct rm_class	*cl;
 	struct pf_mtag	*tag;
 	int		 pktlen;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* should not happen */
 		printf("altq: packet for %s does not have pkthdr\n",
 		    ifq->altq_ifp->if_xname);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* pick the class set by the classifier, else the default class */
 	tag = pf_find_mtag(m);
 	cl = (tag != NULL) ? clh_to_clp(cbqp, tag->qid) : NULL;
 	if (cl == NULL)
 		cl = cbqp->ifnp.default_;
 	if (cl == NULL) {
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	cl->pktattr_ = NULL;
 	/* read the length now; m must not be touched after a failed queue */
 	pktlen = m_pktlen(m);
 	if (rmc_queue_packet(cl, m) != 0) {
 		/* drop occurred.  some mbuf was freed in rmc_queue_packet. */
 		PKTCNTR_ADD(&cl->stats_.drop_cnt, pktlen);
 		return (ENOBUFS);
 	}
 
 	/* successfully queued. */
 	cbqp->cbq_qlen++;
 	IFQ_INC_LEN(ifq);
 
 	return (0);
 }
 
 static struct mbuf *
 cbq_dequeue(struct ifaltq *ifq, int op)
 {
 	cbq_state_t	*cbqp = (cbq_state_t *)ifq->altq_disc;
 	struct mbuf	*m;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	m = rmc_dequeue_next(&cbqp->ifnp, op);
 
 	if (m && op == ALTDQ_REMOVE) {
 		--cbqp->cbq_qlen;  /* decrement # of packets in cbq */
 		IFQ_DEC_LEN(ifq);
 
 		/* Update the class. */
 		rmc_update_class_util(&cbqp->ifnp);
 	}
 	return (m);
 }
 
 /*
  * void
  * cbqrestart(struct ifaltq *) - Restart sending of data.
  *
  *	Passed to rmc_init() as the restart callback.  Calls the interface
  *	start routine when cbq holds packets and the driver is not marked
  *	IFF_DRV_OACTIVE.  The ifq lock must be held on entry; it is dropped
  *	around the call to if_start and re-taken afterwards.
  *
  *	Returns:	NONE
  */
 
 static void
 cbqrestart(struct ifaltq *ifq)
 {
 	cbq_state_t	*cbqp;
 	struct ifnet	*ifp;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	/* cbq must have been detached */
 	if (!ALTQ_IS_ENABLED(ifq))
 		return;
 
 	/* should not happen */
 	cbqp = (cbq_state_t *)ifq->altq_disc;
 	if (cbqp == NULL)
 		return;
 
 	ifp = ifq->altq_ifp;
 	if (ifp->if_start == NULL)
 		return;
 	if (!(cbqp->cbq_qlen > 0))
 		return;
 	if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0)
 		return;
 
 	IFQ_UNLOCK(ifq);
 	(*ifp->if_start)(ifp);
 	IFQ_LOCK(ifq);
 }
 
 /*
  * Drop the packets of every class in the class table and, if ALTQ is
  * enabled on the interface queue, reset that queue's length to zero.
  */
 static void
 cbq_purge(cbq_state_t *cbqp)
 {
 	struct rm_class	*cl;
 	int		 slot;
 
 	for (slot = 0; slot < CBQ_MAX_CLASSES; slot++) {
 		cl = cbqp->cbq_class_tbl[slot];
 		if (cl != NULL)
 			rmc_dropall(cl);
 	}
 
 	if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_))
 		cbqp->ifnp.ifq_->ifq_len = 0;
 }
 
 #endif /* ALTQ_CBQ */
diff --git a/sys/net/altq/altq_codel.c b/sys/net/altq/altq_codel.c
index b9950ed94d2b..be16a5aef3e5 100644
--- a/sys/net/altq/altq_codel.c
+++ b/sys/net/altq/altq_codel.c
@@ -1,475 +1,476 @@
 /*
  * CoDel - The Controlled-Delay Active Queue Management algorithm
  *
  *  Copyright (C) 2013 Ermal Luçi <eri@FreeBSD.org>
  *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
  *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
  *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
  *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * Alternatively, provided that this notice is retained in full, this
  * software may be distributed under the terms of the GNU General
  * Public License ("GPL") version 2, in which case the provisions of the
  * GPL apply INSTEAD OF those given above.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
  * $FreeBSD$
  */
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef ALTQ_CODEL  /* CoDel is enabled by ALTQ_CODEL option in opt_altq.h */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <net/altq/if_altq.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_codel.h>
 
 static int		 codel_should_drop(struct codel *, class_queue_t *,
 			    struct mbuf *, u_int64_t);
 static void		 codel_Newton_step(struct codel_vars *);
 static u_int64_t	 codel_control_law(u_int64_t t, u_int64_t, u_int32_t);
 
 #define	codel_time_after(a, b)		((int64_t)(a) - (int64_t)(b) > 0)
 #define	codel_time_after_eq(a, b)	((int64_t)(a) - (int64_t)(b) >= 0)
 #define	codel_time_before(a, b)		((int64_t)(a) - (int64_t)(b) < 0)
 #define	codel_time_before_eq(a, b)	((int64_t)(a) - (int64_t)(b) <= 0)
 
 static int codel_request(struct ifaltq *, int, void *);
 
 static int codel_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
 static struct mbuf *codel_dequeue(struct ifaltq *, int);
 
 int
 codel_pfattach(struct pf_altq *a)
 {
 	struct ifnet *ifp;
 
 	if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
 		return (EINVAL);
 
 	return (altq_attach(&ifp->if_snd, ALTQT_CODEL, a->altq_disc,
 	    codel_enqueue, codel_dequeue, codel_request));
 }
 
 int
 codel_add_altq(struct ifnet *ifp, struct pf_altq *a)
 {
 	struct codel_if	*cif;
 	struct codel_opts	*opts;
 
 	if (ifp == NULL)
 		return (EINVAL);
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return (ENODEV);
 
 	opts = &a->pq_u.codel_opts;
 
 	cif = malloc(sizeof(struct codel_if), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cif == NULL)
 		return (ENOMEM);
 	cif->cif_bandwidth = a->ifbandwidth;
 	cif->cif_ifq = &ifp->if_snd;
 
 	cif->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cif->cl_q == NULL) {
 		free(cif, M_DEVBUF);
 		return (ENOMEM);
 	}
 
 	if (a->qlimit == 0)
 		a->qlimit = 50;	/* use default. */
 	qlimit(cif->cl_q) = a->qlimit;
 	qtype(cif->cl_q) = Q_CODEL;
 	qlen(cif->cl_q) = 0;
 	qsize(cif->cl_q) = 0;
 
 	if (opts->target == 0)
 		opts->target = 5;
 	if (opts->interval == 0)
 		opts->interval = 100;
 	cif->codel.params.target = machclk_freq * opts->target / 1000;
 	cif->codel.params.interval = machclk_freq * opts->interval / 1000;
 	cif->codel.params.ecn = opts->ecn;
 	cif->codel.stats.maxpacket = 256;
 
 	cif->cl_stats.qlength = qlen(cif->cl_q);
 	cif->cl_stats.qlimit = qlimit(cif->cl_q);
 
 	/* keep the state in pf_altq */
 	a->altq_disc = cif;
 
 	return (0);
 }
 
 /*
  * Detach the CoDel state from a->altq_disc and free it together with
  * its class queue.
  *
  * Returns 0 on success, EINVAL if no state is attached.
  */
 int
 codel_remove_altq(struct pf_altq *a)
 {
 	struct codel_if *state;
 
 	state = a->altq_disc;
 	if (state == NULL)
 		return (EINVAL);
 	a->altq_disc = NULL;
 
 	if (state->cl_q != NULL)
 		free(state->cl_q, M_DEVBUF);
 	free(state, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Copy the statistics of the CoDel instance attached to interface
  * a->ifname out to the user buffer ubuf.
  *
  * *nbytes is the size of the user buffer on entry and is set to the
  * number of bytes copied on success.  The version argument is unused.
  *
  * Returns 0 on success, EBADF if no CoDel instance is attached to the
  * interface, EINVAL if the buffer is too small, or the error reported
  * by copyout().
  */
 int
 codel_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	struct codel_if *cif;
 	struct codel_ifstats stats;
 	int error = 0;
 
 	if ((cif = altq_lookup(a->ifname, ALTQT_CODEL)) == NULL)
 		return (EBADF);
 
 	/*
 	 * *nbytes is an int; test its sign first because a negative value
 	 * converted to size_t would compare as larger than sizeof(stats).
 	 */
 	if (*nbytes < 0 || (size_t)*nbytes < sizeof(stats))
 		return (EINVAL);
 
 	/* interface-level counters, then the CoDel algorithm counters */
 	stats = cif->cl_stats;
 	stats.stats = cif->codel.stats;
 
 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
 		return (error);
 	*nbytes = sizeof(stats);
 
 	return (0);
 }
 
 /*
  * Discipline request handler.  Only ALTRQ_PURGE is acted upon: when
  * ALTQ is enabled and the class queue is not empty, every queued packet
  * is counted as a drop and freed, and the interface queue length is
  * reset to zero.  The ifq lock must be held.
  *
  * Always returns 0.
  */
 static int
 codel_request(struct ifaltq *ifq, int req, void *arg)
 {
 	struct codel_if	*cif = (struct codel_if *)ifq->altq_disc;
 	struct mbuf *pkt;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (req != ALTRQ_PURGE)
 		return (0);
 	if (!ALTQ_IS_ENABLED(cif->cif_ifq) || qempty(cif->cl_q))
 		return (0);
 
 	for (pkt = _getq(cif->cl_q); pkt != NULL; pkt = _getq(cif->cl_q)) {
 		PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(pkt));
 		m_freem(pkt);
 		IFQ_DEC_LEN(cif->cif_ifq);
 	}
 	cif->cif_ifq->ifq_len = 0;
 
 	return (0);
 }
 
 static int
 codel_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
 
 	struct codel_if *cif = (struct codel_if *) ifq->altq_disc;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	/* grab class set by classifier */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* should not happen */
 		printf("altq: packet for %s does not have pkthdr\n",
 		   ifq->altq_ifp->if_xname);
 		m_freem(m);
 		PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m));
 		return (ENOBUFS);
 	}
 
 	if (codel_addq(&cif->codel, cif->cl_q, m)) {
 		PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m));
 		return (ENOBUFS);
 	}
 	IFQ_INC_LEN(ifq);
 
 	return (0);
 }
 
 /*
  * Dequeue (or, for ALTDQ_POLL, peek at) the next packet of the CoDel
  * class queue.  A removed packet is accounted in the interface queue
  * length and in cl_xmitcnt.  The ifq lock must be held.
  *
  * Returns the packet, or NULL when nothing is available.
  */
 static struct mbuf *
 codel_dequeue(struct ifaltq *ifq, int op)
 {
 	struct codel_if *cif = (struct codel_if *)ifq->altq_disc;
 	struct mbuf *pkt;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (IFQ_IS_EMPTY(ifq))
 		return (NULL);
 
 	/* polling only looks at the head of the queue */
 	if (op == ALTDQ_POLL)
 		return (qhead(cif->cl_q));
 
 	pkt = codel_getq(&cif->codel, cif->cl_q);
 	if (pkt == NULL)
 		return (NULL);
 
 	IFQ_DEC_LEN(ifq);
 	PKTCNTR_ADD(&cif->cl_stats.cl_xmitcnt, m_pktlen(pkt));
 
 	return (pkt);
 }
 
 /*
  * Allocate a zeroed CoDel instance.  target and interval are scaled by
  * machclk_freq / 1000 before being stored; maxpacket starts at 256.
  *
  * Returns the new instance, or NULL if the allocation fails.
  */
 struct codel *
 codel_alloc(int target, int interval, int ecn)
 {
 	struct codel *c;
 
 	c = malloc(sizeof(*c), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (c == NULL)
 		return (NULL);
 
 	c->params.target = machclk_freq * target / 1000;
 	c->params.interval = machclk_freq * interval / 1000;
 	c->params.ecn = ecn;
 	c->stats.maxpacket = 256;
 
 	return (c);
 }
 
 /* Release a CoDel instance obtained from codel_alloc(). */
 void
 codel_destroy(struct codel *c)
 {
 	free(c, M_DEVBUF);
 }
 
 #define	MTAG_CODEL	1438031249
 /*
  * Timestamp a packet and append it to the class queue.
  *
  * The enqueue time is stored in an MTAG_CODEL mbuf tag, directly behind
  * the tag header.  An existing tag on the packet is reused; otherwise a
  * new one is allocated and attached.
  *
  * Returns 0 if the packet was queued.  Returns -1 if the queue is at
  * its limit (counted in drop_overlimit) or no tag could be allocated;
  * in both cases the packet has been freed and must not be used by the
  * caller.
  */
 int
 codel_addq(struct codel *c, class_queue_t *q, struct mbuf *m)
 {
 	struct m_tag *mtag;
 	uint64_t *enqueue_time;
 
 	if (qlen(q) >= qlimit(q)) {
 		c->drop_overlimit++;
 		m_freem(m);
 		return (-1);
 	}
 
 	mtag = m_tag_locate(m, MTAG_CODEL, 0, NULL);
 	if (mtag == NULL) {
 		mtag = m_tag_alloc(MTAG_CODEL, 0, sizeof(uint64_t),
 		    M_NOWAIT);
 		if (mtag == NULL) {
 			m_freem(m);
 			return (-1);
 		}
 		/*
 		 * Only a freshly allocated tag is linked in.  A tag found
 		 * by m_tag_locate() is already on the packet's tag list and
 		 * prepending it again would corrupt that list.
 		 */
 		m_tag_prepend(m, mtag);
 	}
 
 	enqueue_time = (uint64_t *)(mtag + 1);
 	*enqueue_time = read_machclk();
 	_addq(q, m);
 
 	return (0);
 }
 
 /*
  * Decide whether packet m, just taken from queue q at time now, should
  * be dropped.
  *
  * The delay of the packet is computed from the enqueue time stored in
  * its MTAG_CODEL tag and recorded in vars.ldelay; stats.maxpacket is
  * raised to the packet length if that is larger.  first_above_time is
  * cleared whenever there is no packet, no tag, the delay is below
  * target, or the queue holds no more than maxpacket bytes.
  *
  * Returns 1 only when the delay has stayed above target past
  * first_above_time; 0 otherwise.
  */
 static int
 codel_should_drop(struct codel *c, class_queue_t *q, struct mbuf *m,
     u_int64_t now)
 {
 	struct m_tag *tag;
 	uint64_t *stamp;
 
 	if (m == NULL)
 		goto below;
 
 	tag = m_tag_locate(m, MTAG_CODEL, 0, NULL);
 	if (tag == NULL) {
 		/* Only one warning per second. */
 		if (ppsratecheck(&c->last_log, &c->last_pps, 1))
 			printf("%s: could not found the packet mtag!\n",
 			    __func__);
 		goto below;
 	}
 
 	stamp = (uint64_t *)(tag + 1);
 	c->vars.ldelay = now - *stamp;
 	c->stats.maxpacket = MAX(c->stats.maxpacket, m_pktlen(m));
 
 	/* went below - stay below for at least interval */
 	if (codel_time_before(c->vars.ldelay, c->params.target) ||
 	    qsize(q) <= c->stats.maxpacket)
 		goto below;
 
 	if (c->vars.first_above_time == 0) {
 		/*
 		 * just went above from below. If we stay above
 		 * for at least interval we'll say it's ok to drop
 		 */
 		c->vars.first_above_time = now + c->params.interval;
 		return (0);
 	}
 
 	return (codel_time_after(now, c->vars.first_above_time) ? 1 : 0);
 
 below:
 	c->vars.first_above_time = 0;
 	return (0);
 }
 
 /*
  * Run a Newton method step:
  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
  *
  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
  *
  * Reads vars->count and vars->rec_inv_sqrt and writes the refined
  * estimate back to vars->rec_inv_sqrt.
  *
  * The REC_INV_SQRT_* macros defined inside the body stay defined for
  * the rest of the file and are used by later functions.
  */
 static void
 codel_Newton_step(struct codel_vars *vars)
 {
 	uint32_t invsqrt, invsqrt2;
 	uint64_t val;
 
 /* sizeof_in_bits(rec_inv_sqrt) */
 #define	REC_INV_SQRT_BITS (8 * sizeof(u_int16_t))
 /* needed shift to get a Q0.32 number from rec_inv_sqrt */
 #define	REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS)
 
 	/* widen the stored value to Q0.32 */
 	invsqrt = ((u_int32_t)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
 	/* invsqrt^2, brought back to 32 fractional bits */
 	invsqrt2 = ((u_int64_t)invsqrt * invsqrt) >> 32;
 	/* 3 - count * invsqrt^2, with 32 fractional bits */
 	val = (3LL << 32) - ((u_int64_t)vars->count * invsqrt2);
 	val >>= 2; /* avoid overflow in following multiply */
 	/*
 	 * multiply by invsqrt; the shift drops the 32 extra fractional
 	 * bits, undoes the >> 2 above and applies the division by 2
 	 */
 	val = (val * invsqrt) >> (32 - 2 + 1);
 
 	/* narrow back to the stored width */
 	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
 }
 
 /*
  * Compute the time of the next drop: t plus interval scaled by
  * rec_inv_sqrt, where rec_inv_sqrt is first widened to a Q0.32
  * fraction with REC_INV_SQRT_SHIFT.
  */
 static u_int64_t
 codel_control_law(u_int64_t t, u_int64_t interval, u_int32_t rec_inv_sqrt)
 {
 	u_int32_t frac, step;
 
 	frac = rec_inv_sqrt << REC_INV_SQRT_SHIFT;
 	step = (u_int32_t)(((u_int64_t)interval * frac) >> 32);
 
 	return (t + step);
 }
 
 /*
  * Take the next packet to transmit from queue q, applying the CoDel
  * drop decisions.
  *
  * Packets that codel_should_drop() selects are freed here and counted
  * in c->stats.drop_cnt; the packet dequeued after the last drop is the
  * one returned.  The dropping state, drop count and next drop time are
  * kept in c->vars.
  *
  * Returns the packet to send, or NULL if the queue was empty or became
  * empty after dropping.
  */
 struct mbuf *
 codel_getq(struct codel *c, class_queue_t *q)
 {
 	struct mbuf	*m;
 	u_int64_t	 now;
 	int		 drop;
 
 	/* empty queue: leave dropping state and return NULL */
 	if ((m = _getq(q)) == NULL) {
 		c->vars.dropping = 0;
 		return (m);
 	}
 
 	now = read_machclk();
 	drop = codel_should_drop(c, q, m, now);
 	if (c->vars.dropping) {
 		if (!drop) {
 			/* sojourn time below target - leave dropping state */
 			c->vars.dropping = 0;
 		} else if (codel_time_after_eq(now, c->vars.drop_next)) {
 			/* It's time for the next drop. Drop the current
 			 * packet and dequeue the next. The dequeue might
 			 * take us out of dropping state.
 			 * If not, schedule the next drop.
 			 * A large backlog might result in drop rates so high
 			 * that the next drop should happen now,
 			 * hence the while loop.
 			 */
 			while (c->vars.dropping &&
 			    codel_time_after_eq(now, c->vars.drop_next)) {
 				c->vars.count++; /* don't care of possible wrap
 						  * since there is no more
 						  * divide */
 				codel_Newton_step(&c->vars);
 				/* TODO ECN */
 				PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
 				m_freem(m);
 				/*
 				 * m may be NULL here; codel_should_drop()
 				 * returns 0 for NULL, which ends the loop.
 				 */
 				m = _getq(q);
 				if (!codel_should_drop(c, q, m, now))
 					/* leave dropping state */
 					c->vars.dropping = 0;
 				else
 					/* and schedule the next drop */
 					c->vars.drop_next =
 					    codel_control_law(c->vars.drop_next,
 						c->params.interval,
 						c->vars.rec_inv_sqrt);
 			}
 		}
 	} else if (drop) {
 		/* not in dropping state yet: drop this packet and enter it */
 		/* TODO ECN */
 		PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
 		m_freem(m);
 
 		/*
 		 * the replacement packet may be NULL; the result stored in
 		 * drop is not read again below
 		 */
 		m = _getq(q);
 		drop = codel_should_drop(c, q, m, now);
 
 		c->vars.dropping = 1;
 		/* if min went above target close to when we last went below it
 		 * assume that the drop rate that controlled the queue on the
 		 * last cycle is a good starting point to control it now.
 		 */
 		if (codel_time_before(now - c->vars.drop_next,
 		    16 * c->params.interval)) {
 			c->vars.count = (c->vars.count - c->vars.lastcount) | 1;
 			/* we don't care if rec_inv_sqrt approximation
 			 * is not very precise :
 			 * Next Newton steps will correct it quadratically.
 			 */
 			codel_Newton_step(&c->vars);
 		} else {
 			/* start over: count of 1, largest stored rec_inv_sqrt */
 			c->vars.count = 1;
 			c->vars.rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
 		}
 		c->vars.lastcount = c->vars.count;
 		c->vars.drop_next = codel_control_law(now, c->params.interval,
 		    c->vars.rec_inv_sqrt);
 	}
 
 	return (m);
 }
 
 /* Copy the statistics of CoDel instance c into *s. */
 void
 codel_getstats(struct codel *c, struct codel_stats *s)
 {
 
 	*s = c->stats;
 }
 
 #endif /* ALTQ_CODEL */
diff --git a/sys/net/altq/altq_fairq.c b/sys/net/altq/altq_fairq.c
index b4eeb1ddf9b4..b33543548235 100644
--- a/sys/net/altq/altq_fairq.c
+++ b/sys/net/altq/altq_fairq.c
@@ -1,907 +1,908 @@
 /*
  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
  * 
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  * 3. Neither the name of The DragonFly Project nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific, prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
  * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.1 2008/04/06 18:58:15 dillon Exp $
  * $FreeBSD$
  */
 /*
  * Matt: I gutted altq_priq.c and used it as a skeleton on which to build
  * fairq.  The fairq algorithm is completely different than priq, of course,
  * but because I used priq's skeleton I believe I should include priq's
  * copyright.
  *
  * Copyright (C) 2000-2003
  *	Sony Computer Science Laboratories Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * FAIRQ - take traffic classified by keep state (hashed into
  * mbuf->m_pkthdr.altq_state_hash) and bucketize it.  Fairly extract
  * the first packet from each bucket in a round-robin fashion.
  *
  * TODO - better overall qlimit support (right now it is per-bucket).
  *	- NOTE: red etc is per bucket, not overall.
  *	- better service curve support.
  *
  * EXAMPLE:
  *
  *  altq on em0 fairq bandwidth 650Kb queue { std, bulk }
  *  queue std  priority 3 bandwidth 400Kb \
  *	fairq (buckets 64, default, hogs 1Kb) qlimit 50
  *  queue bulk priority 2 bandwidth 100Kb \
  *	fairq (buckets 64, hogs 1Kb) qlimit 50
  *
  *  pass out on em0 from any to any keep state queue std
  *  pass out on em0 inet proto tcp ..... port ... keep state queue bulk
  */
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef ALTQ_FAIRQ  /* fairq is enabled in the kernel conf */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <netpfil/pf/pf_mtag.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_fairq.h>
 
 /*
  * function prototypes
  */
 static int	fairq_clear_interface(struct fairq_if *);
 static int	fairq_request(struct ifaltq *, int, void *);
 static void	fairq_purge(struct fairq_if *);
 static struct fairq_class *fairq_class_create(struct fairq_if *, int, int, u_int, struct fairq_opts *, int);
 static int	fairq_class_destroy(struct fairq_class *);
 static int	fairq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
 static struct mbuf *fairq_dequeue(struct ifaltq *, int);
 
 static int	fairq_addq(struct fairq_class *, struct mbuf *, u_int32_t);
 static struct mbuf *fairq_getq(struct fairq_class *, uint64_t);
 static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *);
 static fairq_bucket_t *fairq_selectq(struct fairq_class *, int);
 static void	fairq_purgeq(struct fairq_class *);
 
 static void	get_class_stats(struct fairq_classstats *, struct fairq_class *);
 static struct fairq_class *clh_to_clp(struct fairq_if *, uint32_t);
 
 /*
  * Attach the fairq discipline stored in a->altq_disc to the send queue
  * of the interface named by a->ifname.
  *
  * Returns EINVAL when the interface cannot be found or no discipline
  * state is present, otherwise whatever altq_attach() returns.
  */
 int
 fairq_pfattach(struct pf_altq *a)
 {
 	struct ifnet *ifp = ifunit(a->ifname);
 
 	if (ifp == NULL || a->altq_disc == NULL)
 		return (EINVAL);
 
 	return (altq_attach(&ifp->if_snd, ALTQT_FAIRQ, a->altq_disc,
 	    fairq_enqueue, fairq_dequeue, fairq_request));
 }
 
 /*
  * Allocate the per-interface fairq state for `ifp' and record it in
  * a->altq_disc.
  *
  * Returns EINVAL for a NULL interface, ENODEV when the interface send
  * queue is not ALTQ-ready, and 0 on success.
  */
 int
 fairq_add_altq(struct ifnet *ifp, struct pf_altq *a)
 {
 	struct fairq_if *fif;
 
 	if (ifp == NULL)
 		return (EINVAL);
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return (ENODEV);
 
 	fif = malloc(sizeof(*fif), M_DEVBUF, M_WAITOK | M_ZERO);
 	fif->pif_ifq = &ifp->if_snd;
 	fif->pif_bandwidth = a->ifbandwidth;
 	/* no class has been created yet */
 	fif->pif_maxpri = -1;
 
 	/* the discipline state lives in the pf_altq entry */
 	a->altq_disc = fif;
 	return (0);
 }
 
 /*
  * Detach the fairq state from the pf_altq entry, destroy all of its
  * classes and release it.  Returns EINVAL if no state is attached.
  */
 int
 fairq_remove_altq(struct pf_altq *a)
 {
 	struct fairq_if *fif = a->altq_disc;
 
 	if (fif == NULL)
 		return (EINVAL);
 
 	a->altq_disc = NULL;
 	fairq_clear_interface(fif);
 	free(fif, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Create the fairq class described by the pf_altq entry `a'.
  *
  * Returns EINVAL when no discipline state is attached, the priority is
  * out of range or the queue id is 0; EBUSY when the priority slot or the
  * queue id is already taken; ENOMEM when the class cannot be created.
  */
 int
 fairq_add_queue(struct pf_altq *a)
 {
 	struct fairq_if *fif = a->altq_disc;
 
 	if (fif == NULL)
 		return (EINVAL);
 
 	/* parameter validation */
 	if (a->priority >= FAIRQ_MAXPRI || a->qid == 0)
 		return (EINVAL);
 
 	/* one class per priority, one class per handle */
 	if (fif->pif_classes[a->priority] != NULL ||
 	    clh_to_clp(fif, a->qid) != NULL)
 		return (EBUSY);
 
 	if (fairq_class_create(fif, a->priority, a->qlimit, a->bandwidth,
 	    &a->pq_u.fairq_opts, a->qid) == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Destroy the class whose handle is a->qid.  Returns EINVAL when no
  * discipline state is attached or the class is unknown, otherwise the
  * result of fairq_class_destroy().
  */
 int
 fairq_remove_queue(struct pf_altq *a)
 {
 	struct fairq_if *fif = a->altq_disc;
 	struct fairq_class *victim;
 
 	if (fif == NULL)
 		return (EINVAL);
 
 	victim = clh_to_clp(fif, a->qid);
 	if (victim == NULL)
 		return (EINVAL);
 
 	return (fairq_class_destroy(victim));
 }
 
 /*
  * Copy the statistics of the class identified by a->qid on interface
  * a->ifname out to the user buffer `ubuf'.
  *
  * On entry *nbytes is the size of the user buffer; on success it is set
  * to the number of bytes copied.  The `version' argument is not used by
  * this discipline.
  *
  * Returns EBADF when no fairq discipline is found for the interface,
  * EINVAL when the class does not exist or the buffer is too small, or
  * the error returned by copyout().
  */
 int
 fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	struct fairq_if *pif;
 	struct fairq_class *cl;
 	struct fairq_classstats stats;
 	int error = 0;
 
 	if ((pif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL)
 		return (EBADF);
 
 	if ((cl = clh_to_clp(pif, a->qid)) == NULL)
 		return (EINVAL);
 
 	/*
 	 * Reject negative sizes explicitly: compared directly against
 	 * sizeof() a negative int is converted to a huge unsigned value
 	 * and would slip past the check.
 	 */
 	if (*nbytes < 0 || (size_t)*nbytes < sizeof(stats))
 		return (EINVAL);
 
 	/*
 	 * get_class_stats() only fills the members relevant to the class'
 	 * queue type, so clear the structure first; otherwise stale kernel
 	 * stack contents would be copied out to userland.
 	 */
 	memset(&stats, 0, sizeof(stats));
 	get_class_stats(&stats, cl);
 
 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
 		return (error);
 	*nbytes = sizeof(stats);
 	return (0);
 }
 
 /*
  * Return the interface to its initial state by destroying every class
  * that is currently installed.  Always returns 0.
  */
 static int
 fairq_clear_interface(struct fairq_if *pif)
 {
 	int pri = 0;
 
 	/*
 	 * pif_maxpri is re-read on every pass because
 	 * fairq_class_destroy() may lower it.
 	 */
 	while (pri <= pif->pif_maxpri) {
 		struct fairq_class *cl = pif->pif_classes[pri];
 
 		if (cl != NULL)
 			fairq_class_destroy(cl);
 		pri++;
 	}
 
 	return (0);
 }
 
 /*
  * Request handler registered with altq_attach().  Only ALTRQ_PURGE is
  * acted upon; every other request is ignored.  Always returns 0.
  * The caller must hold the ifq lock.
  */
 static int
 fairq_request(struct ifaltq *ifq, int req, void *arg)
 {
 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (req == ALTRQ_PURGE)
 		fairq_purge(pif);
 
 	return (0);
 }
 
 /*
  * Drop every packet queued on the interface: purge each class that has
  * active buckets and, when ALTQ is enabled on the queue, reset the
  * queue length counter.
  */
 static void
 fairq_purge(struct fairq_if *pif)
 {
 	int pri;
 
 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
 		struct fairq_class *cl = pif->pif_classes[pri];
 
 		if (cl == NULL || cl->cl_head == NULL)
 			continue;
 		fairq_purgeq(cl);
 	}
 
 	if (ALTQ_IS_ENABLED(pif->pif_ifq))
 		pif->pif_ifq->ifq_len = 0;
 }
 
 /*
  * Create the class for priority slot `pri' on `pif', or re-initialize
  * the class already installed in that slot.
  *
  *	pif		per-interface fairq state
  *	pri		index into pif->pif_classes[]
  *	qlimit		per-bucket queue limit; 0 selects the default of 50
  *	bandwidth	class bandwidth; stored divided by 8
  *	opts		fairq options (flags, nbuckets, hogs_m1, lssc_m1)
  *	qid		class handle
  *
  * Returns the class, or NULL when RED or CODEL is requested in
  * opts->flags but the corresponding support is not compiled in.
  */
 static struct fairq_class *
 fairq_class_create(struct fairq_if *pif, int pri, int qlimit,
 		   u_int bandwidth, struct fairq_opts *opts, int qid)
 {
 	struct fairq_class *cl;
 	int flags = opts->flags;
 	u_int nbuckets = opts->nbuckets;
 	int i;
 
 #ifndef ALTQ_RED
 	if (flags & FARF_RED) {
 #ifdef ALTQ_DEBUG
 		printf("fairq_class_create: RED not configured for FAIRQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 #ifndef ALTQ_CODEL
 	if (flags & FARF_CODEL) {
 #ifdef ALTQ_DEBUG
 		printf("fairq_class_create: CODEL not configured for FAIRQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 	/* bucket count: default 256, capped at FAIRQ_MAX_BUCKETS */
 	if (nbuckets == 0)
 		nbuckets = 256;
 	if (nbuckets > FAIRQ_MAX_BUCKETS)
 		nbuckets = FAIRQ_MAX_BUCKETS;
 	/* enforce power-of-2 size (round up until only one bit is set) */
 	while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1))
 		++nbuckets;
 
 	if ((cl = pif->pif_classes[pri]) != NULL) {
 		/*
 		 * modify the class instead of creating a new one:
 		 * drop its queued packets under the ifq lock and tear
 		 * down the old queue-management state.  The existing
 		 * bucket array is kept; nbuckets computed above is not
 		 * applied on this path.
 		 */
 		IFQ_LOCK(cl->cl_pif->pif_ifq);
 		if (cl->cl_head)
 			fairq_purgeq(cl);
 		IFQ_UNLOCK(cl->cl_pif->pif_ifq);
 #ifdef ALTQ_RIO
 		if (cl->cl_qtype == Q_RIO)
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (cl->cl_qtype == Q_RED)
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (cl->cl_qtype == Q_CODEL)
 			codel_destroy(cl->cl_codel);
 #endif
 	} else {
 		/* new class: zeroed class structure plus zeroed buckets */
 		cl = malloc(sizeof(struct fairq_class),
 				M_DEVBUF, M_WAITOK | M_ZERO);
 		cl->cl_nbuckets = nbuckets;
 		cl->cl_nbucket_mask = nbuckets - 1;
 
 		cl->cl_buckets = malloc(
 			sizeof(struct fairq_bucket) * cl->cl_nbuckets,
 			M_DEVBUF, M_WAITOK | M_ZERO);
 		cl->cl_head = NULL;
 	}
 
 	pif->pif_classes[pri] = cl;
 	if (flags & FARF_DEFAULTCLASS)
 		pif->pif_default = cl;
 	if (qlimit == 0)
 		qlimit = 50;  /* use default */
 	cl->cl_qlimit = qlimit;
 	/* the limit is applied to every bucket queue individually */
 	for (i = 0; i < cl->cl_nbuckets; ++i) {
 		qlimit(&cl->cl_buckets[i].queue) = qlimit;
 	}
 	/* NOTE(review): presumably bits/s converted to bytes/s - confirm */
 	cl->cl_bandwidth = bandwidth / 8;
 	/* drop-tail unless one of the allocations below succeeds */
 	cl->cl_qtype = Q_DROPTAIL;
 	cl->cl_flags = flags & FARF_USERFLAGS;
 	cl->cl_pri = pri;
 	if (pri > pif->pif_maxpri)
 		pif->pif_maxpri = pri;
 	cl->cl_pif = pif;
 	cl->cl_handle = qid;
 	cl->cl_hogs_m1 = opts->hogs_m1 / 8;
 	cl->cl_lssc_m1 = opts->lssc_m1 / 8;	/* NOT YET USED */
 
 #ifdef ALTQ_RED
 	if (flags & (FARF_RED|FARF_RIO)) {
 		int red_flags, red_pkttime;
 
 		red_flags = 0;
 		if (flags & FARF_ECN)
 			red_flags |= REDF_ECN;
 #ifdef ALTQ_RIO
 		if (flags & FARF_CLEARDSCP)
 			red_flags |= RIOF_CLEARDSCP;
 #endif
 		/*
 		 * pif_bandwidth / 8 would be 0 below 8, so that case
 		 * uses a fixed value instead of dividing.
 		 * NOTE(review): the 64-bit quotient is stored in an int;
 		 * confirm it cannot exceed INT_MAX for small bandwidths.
 		 */
 		if (pif->pif_bandwidth < 8)
 			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
 		else
 			red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
 			  * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
 #ifdef ALTQ_RIO
 		if (flags & FARF_RIO) {
 			cl->cl_red = (red_t *)rio_alloc(0, NULL,
 						red_flags, red_pkttime);
 			if (cl->cl_red != NULL)
 				cl->cl_qtype = Q_RIO;
 		} else
 #endif
 		if (flags & FARF_RED) {
 			/* thresholds at 10% and 30% of the queue limit */
 			cl->cl_red = red_alloc(0, 0,
 			    cl->cl_qlimit * 10/100,
 			    cl->cl_qlimit * 30/100,
 			    red_flags, red_pkttime);
 			if (cl->cl_red != NULL)
 				cl->cl_qtype = Q_RED;
 		}
 	}
 #endif /* ALTQ_RED */
 #ifdef ALTQ_CODEL
 	if (flags & FARF_CODEL) {
 		cl->cl_codel = codel_alloc(5, 100, 0);
 		if (cl->cl_codel != NULL)
 			cl->cl_qtype = Q_CODEL;
 	}
 #endif
 
 	return (cl);
 }
 
 /*
  * Destroy a class: drop its queued packets, unlink it from the
  * interface state, release its queue-management state, its bucket
  * array and the class itself.  Always returns 0.
  */
 static int
 fairq_class_destroy(struct fairq_class *cl)
 {
 	struct fairq_if *pif;
 	int pri;
 
 	/* queue contents and the class table are changed under the ifq lock */
 	IFQ_LOCK(cl->cl_pif->pif_ifq);
 
 	if (cl->cl_head)
 		fairq_purgeq(cl);
 
 	pif = cl->cl_pif;
 	pif->pif_classes[cl->cl_pri] = NULL;
 	/* do not leave a poll cache entry pointing at the dying class */
 	if (pif->pif_poll_cache == cl)
 		pif->pif_poll_cache = NULL;
 	/*
 	 * If this was the highest priority class, search downwards for
 	 * the next installed class; -1 means no class is left.
 	 */
 	if (pif->pif_maxpri == cl->cl_pri) {
 		for (pri = cl->cl_pri; pri >= 0; pri--)
 			if (pif->pif_classes[pri] != NULL) {
 				pif->pif_maxpri = pri;
 				break;
 			}
 		if (pri < 0)
 			pif->pif_maxpri = -1;
 	}
 	IFQ_UNLOCK(cl->cl_pif->pif_ifq);
 
 	/*
 	 * NOTE(review): the codel teardown also sits behind the
 	 * cl_red != NULL test; presumably cl_red and cl_codel share
 	 * storage - confirm against altq_fairq.h.
 	 */
 	if (cl->cl_red != NULL) {
 #ifdef ALTQ_RIO
 		if (cl->cl_qtype == Q_RIO)
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (cl->cl_qtype == Q_RED)
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (cl->cl_qtype == Q_CODEL)
 			codel_destroy(cl->cl_codel);
 #endif
 	}
 	free(cl->cl_buckets, M_DEVBUF);
 	free(cl, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * fairq_enqueue is an enqueue function to be registered to
  * (*altq_enqueue) in struct ifaltq.
  *
  * The class is looked up from the qid in the packet's pf mtag; packets
  * without a tag, or whose qid matches no class, go to the default
  * class.  Returns 0 on success and ENOBUFS when the packet was dropped
  * (no packet header, no usable class, or rejected by fairq_addq()).
  * The mbuf is consumed in every case.  The caller must hold the ifq
  * lock.
  */
 static int
 fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
 	struct fairq_class *cl = NULL; /* Make compiler happy */
 	struct pf_mtag *t;
 	u_int32_t qid_hash = 0;
 	int len;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	/* grab class set by classifier */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* should not happen */
 		printf("altq: packet for %s does not have pkthdr\n",
 			ifq->altq_ifp->if_xname);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	if ((t = pf_find_mtag(m)) != NULL) {
 		cl = clh_to_clp(pif, t->qid);
 		qid_hash = t->qid_hash;
 	}
 	/* fall back to the default class, if one is configured */
 	if (cl == NULL) {
 		cl = pif->pif_default;
 		if (cl == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 	}
 	/* set before the add; fairq_dequeue() clears it once polling finds nothing */
 	cl->cl_flags |= FARF_HAS_PACKETS;
 	cl->cl_pktattr = NULL;
 	/* save the length now: the mbuf is gone if fairq_addq() drops it */
 	len = m_pktlen(m);
 	if (fairq_addq(cl, m, qid_hash) != 0) {
 		/* drop occurred.  mbuf was freed in fairq_addq. */
 		PKTCNTR_ADD(&cl->cl_dropcnt, len);
 		return (ENOBUFS);
 	}
 	IFQ_INC_LEN(ifq);
 
 	return (0);
 }
 
 /*
  * fairq_dequeue is a dequeue function to be registered to
  * (*altq_dequeue) in struct ifaltq.
  *
  * note: ALTDQ_POLL returns the next packet without removing the packet
  *	from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
  *	ALTDQ_REMOVE must return the same packet if called immediately
  *	after ALTDQ_POLL.
  *
  * Returns the selected mbuf, or NULL when nothing can be dequeued.
  * The caller must hold the ifq lock.
  */
 static struct mbuf *
 fairq_dequeue(struct ifaltq *ifq, int op)
 {
 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
 	struct fairq_class *cl;
 	struct fairq_class *best_cl;
 	struct mbuf *best_m;
 	struct mbuf *m = NULL;
 	uint64_t cur_time = read_machclk();
 	int pri;
 	int hit_limit;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (IFQ_IS_EMPTY(ifq)) {
 		return (NULL);
 	}
 
 	if (pif->pif_poll_cache && op == ALTDQ_REMOVE) {
 		/*
 		 * A preceding ALTDQ_POLL already chose a class; take the
 		 * packet from that class and invalidate the cache.
 		 */
 		best_cl = pif->pif_poll_cache;
 		m = fairq_getq(best_cl, cur_time);
 		pif->pif_poll_cache = NULL;
 		if (m) {
 			IFQ_DEC_LEN(ifq);
 			PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
 			return (m);
 		}
 	} else {
 		best_cl = NULL;
 		best_m = NULL;
 
 		/* scan the classes from the highest priority downwards */
 		for (pri = pif->pif_maxpri;  pri >= 0; pri--) {
 			if ((cl = pif->pif_classes[pri]) == NULL)
 				continue;
 			if ((cl->cl_flags & FARF_HAS_PACKETS) == 0)
 				continue;
 			m = fairq_pollq(cl, cur_time, &hit_limit);
 			if (m == NULL) {
 				/* nothing queued; skip this class until the next enqueue */
 				cl->cl_flags &= ~FARF_HAS_PACKETS;
 				continue;
 			}
 
 			/*
 			 * Only override the best choice if we are under
 			 * the BW limit.
 			 */
 			if (hit_limit == 0 || best_cl == NULL) {
 				best_cl = cl;
 				best_m = m;
 			}
 
 			/*
 			 * Remember the highest priority mbuf in case we
 			 * do not find any lower priority mbufs.
 			 */
 			if (hit_limit)
 				continue;
 			break;
 		}
 		if (op == ALTDQ_POLL) {
 			/* remember the choice for the following ALTDQ_REMOVE */
 			pif->pif_poll_cache = best_cl;
 			m = best_m;
 		} else if (best_cl) {
 			m = fairq_getq(best_cl, cur_time);
 			if (m != NULL) {
 				IFQ_DEC_LEN(ifq);
 				PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
 			}
 		} 
 		return (m);
 	}
 	/* the cached class yielded no packet */
 	return (NULL);
 }
 
 /*
  * Queue mbuf `m' on the bucket of class `cl' selected by `bucketid',
  * linking the bucket into the class' circular list of active buckets
  * if it is not on it yet.
  *
  * For the drop-tail case below: returns 0 when the packet was queued
  * and -1 when the bucket queue is full, in which case the mbuf is
  * freed.  For RIO/RED/CODEL classes the result of the respective
  * *_addq() routine is returned unchanged.
  */
 static int
 fairq_addq(struct fairq_class *cl, struct mbuf *m, u_int32_t bucketid)
 {
 	fairq_bucket_t *b;
 	u_int hindex;
 	uint64_t bw;
 
 	/*
 	 * If the packet doesn't have any keep state put it on the end of
 	 * our queue.  XXX this can result in out of order delivery.
 	 */
 	if (bucketid == 0) {
 		if (cl->cl_head)
 			b = cl->cl_head->prev;
 		else
 			b = &cl->cl_buckets[0];
 	} else {
 		/* cl_nbucket_mask is the bucket count minus one */
 		hindex = bucketid & cl->cl_nbucket_mask;
 		b = &cl->cl_buckets[hindex];
 	}
 
 	/*
 	 * Add the bucket to the end of the circular list of active buckets.
 	 *
 	 * As a special case we add the bucket to the beginning of the list
 	 * instead of the end if it was not previously on the list and if
 	 * its traffic is less than the hog level.
 	 */
 	if (b->in_use == 0) {
 		b->in_use = 1;
 		if (cl->cl_head == NULL) {
 			/* first active bucket: a one-element ring */
 			cl->cl_head = b;
 			b->next = b;
 			b->prev = b;
 		} else {
 			/* link in just before the head, i.e. at the tail */
 			b->next = cl->cl_head;
 			b->prev = cl->cl_head->prev;
 			b->prev->next = b;
 			b->next->prev = b;
 
 			/* moving the head onto b puts it at the front */
 			if (b->bw_delta && cl->cl_hogs_m1) {
 				bw = b->bw_bytes * machclk_freq / b->bw_delta;
 				if (bw < cl->cl_hogs_m1)
 					cl->cl_head = b;
 			}
 		}
 	}
 
 #ifdef ALTQ_RIO
 	if (cl->cl_qtype == Q_RIO)
 		return rio_addq((rio_t *)cl->cl_red, &b->queue, m, cl->cl_pktattr);
 #endif
 #ifdef ALTQ_RED
 	if (cl->cl_qtype == Q_RED)
 		return red_addq(cl->cl_red, &b->queue, m, cl->cl_pktattr);
 #endif
 #ifdef ALTQ_CODEL
 	if (cl->cl_qtype == Q_CODEL)
 		return codel_addq(cl->cl_codel, &b->queue, m);
 #endif
 	/* drop-tail: refuse the packet once the bucket queue is full */
 	if (qlen(&b->queue) >= qlimit(&b->queue)) {
 		m_freem(m);
 		return (-1);
 	}
 
 	if (cl->cl_flags & FARF_CLEARDSCP)
 		write_dsfield(m, cl->cl_pktattr, 0);
 
 	_addq(&b->queue, m);
 
 	return (0);
 }
 
 /*
  * Remove and return the next packet of class `cl', or NULL when no
  * bucket is selected or the selected queue yields nothing.  When a
  * packet is returned the bandwidth accounting of both the class and
  * the bucket it came from is updated using `cur_time'.
  */
 static struct mbuf *
 fairq_getq(struct fairq_class *cl, uint64_t cur_time)
 {
 	fairq_bucket_t *b;
 	struct mbuf *m;
 
 	b = fairq_selectq(cl, 0);
 	if (b == NULL)
 		m = NULL;
 #ifdef ALTQ_RIO
 	else if (cl->cl_qtype == Q_RIO)
 		m = rio_getq((rio_t *)cl->cl_red, &b->queue);
 #endif
 #ifdef ALTQ_RED
 	else if (cl->cl_qtype == Q_RED)
 		m = red_getq(cl->cl_red, &b->queue);
 #endif
 #ifdef ALTQ_CODEL
 	else if (cl->cl_qtype == Q_CODEL)
 		m = codel_getq(cl->cl_codel, &b->queue);
 #endif
 	else
 		m = _getq(&b->queue);
 
 	/*
 	 * Calculate the BW change
 	 */
 	if (m != NULL) {
 		uint64_t delta;
 
 		/*
 		 * Per-class bandwidth calculation
 		 *
 		 * The elapsed time is capped at machclk_freq * 8 ticks;
 		 * after adding the new sample both accumulators are
 		 * reduced by 1/8 of their value.
 		 */
 		delta = (cur_time - cl->cl_last_time);
 		if (delta > machclk_freq * 8)
 			delta = machclk_freq * 8;
 		cl->cl_bw_delta += delta;
 		cl->cl_bw_bytes += m->m_pkthdr.len;
 		cl->cl_last_time = cur_time;
 		cl->cl_bw_delta -= cl->cl_bw_delta >> 3;
 		cl->cl_bw_bytes -= cl->cl_bw_bytes >> 3;
 
 		/*
 		 * Per-bucket bandwidth calculation
 		 * (same scheme as for the class above)
 		 */
 		delta = (cur_time - b->last_time);
 		if (delta > machclk_freq * 8)
 			delta = machclk_freq * 8;
 		b->bw_delta += delta;
 		b->bw_bytes += m->m_pkthdr.len;
 		b->last_time = cur_time;
 		b->bw_delta -= b->bw_delta >> 3;
 		b->bw_bytes -= b->bw_bytes >> 3;
 	}
 	return(m);
 }
 
 /*
  * Figure out what the next packet would be if there were no limits.  If
  * this class hits its bandwidth limit *hit_limit is set to non-zero,
  * otherwise it is set to 0.
  *
  * Returns the packet at the head of the selected bucket without
  * removing it, or NULL when fairq_selectq() finds no bucket.  Note that
  * polling also advances the class' cl_bw_delta / cl_last_time
  * accounting.
  */
 static struct mbuf *
 fairq_pollq(struct fairq_class *cl, uint64_t cur_time, int *hit_limit)
 {
 	fairq_bucket_t *b;
 	struct mbuf *m;
 	uint64_t delta;
 	uint64_t bw;
 
 	*hit_limit = 0;
 	b = fairq_selectq(cl, 1);
 	if (b == NULL)
 		return(NULL);
 	m = qhead(&b->queue);
 
 	/*
 	 * Did this packet exceed the class bandwidth?  Calculate the
 	 * bandwidth component of the packet.
 	 *
 	 * - Calculate bytes per second
 	 */
 	delta = cur_time - cl->cl_last_time;
 	/* cap the elapsed time at machclk_freq * 8 ticks */
 	if (delta > machclk_freq * 8)
 		delta = machclk_freq * 8;
 	cl->cl_bw_delta += delta;
 	cl->cl_last_time = cur_time;
 	/* skip the rate check while no time has been accumulated */
 	if (cl->cl_bw_delta) {
 		bw = cl->cl_bw_bytes * machclk_freq / cl->cl_bw_delta;
 
 		if (bw > cl->cl_bandwidth)
 			*hit_limit = 1;
 #ifdef ALTQ_DEBUG
 		printf("BW %6ju relative to %6u %d queue %p\n",
 			(uintmax_t)bw, cl->cl_bandwidth, *hit_limit, b);
 #endif
 	}
 	return(m);
 }
 
 /*
  * Locate the next queue we want to pull a packet out of.  This code
  * is also responsible for removing empty buckets from the circular list.
  *
  * With ispoll != 0 the chosen bucket is remembered in cl->cl_polled;
  * the next call with ispoll == 0 returns that bucket (and clears the
  * memo) instead of selecting again.  Returns NULL when the class has
  * no non-empty bucket.
  */
 static
 fairq_bucket_t *
 fairq_selectq(struct fairq_class *cl, int ispoll)
 {
 	fairq_bucket_t *b;
 	uint64_t bw;
 
 	/* hand out the bucket chosen by the preceding poll */
 	if (ispoll == 0 && cl->cl_polled) {
 		b = cl->cl_polled;
 		cl->cl_polled = NULL;
 		return(b);
 	}
 
 	while ((b = cl->cl_head) != NULL) {
 		/*
 		 * Remove empty queues from consideration
 		 */
 		if (qempty(&b->queue)) {
 			b->in_use = 0;
 			cl->cl_head = b->next;
 			if (cl->cl_head == b) {
 				/* b was the only bucket on the ring */
 				cl->cl_head = NULL;
 			} else {
 				b->next->prev = b->prev;
 				b->prev->next = b->next;
 			}
 			continue;
 		}
 
 		/*
 		 * Advance the round robin.  Queues with bandwidths less
 		 * than the hog bandwidth are allowed to burst.
 		 */
 		if (cl->cl_hogs_m1 == 0) {
 			cl->cl_head = b->next;
 		} else if (b->bw_delta) {
 			bw = b->bw_bytes * machclk_freq / b->bw_delta;
 			if (bw >= cl->cl_hogs_m1) {
 				cl->cl_head = b->next;
 			}
 			/*
 			 * XXX TODO - 
 			 */
 		}
 
 		/*
 		 * Return bucket b.
 		 */
 		break;
 	}
 	if (ispoll)
 		cl->cl_polled = b;
 	return(b);
 }
 
 /*
  * Free every packet queued on class `cl', charging each one to the
  * class' drop counter.  Buckets are obtained through fairq_selectq(),
  * which also unlinks the emptied buckets from the active list.
  */
 static void
 fairq_purgeq(struct fairq_class *cl)
 {
 	fairq_bucket_t *b;
 	struct mbuf *m;
 
 	while ((b = fairq_selectq(cl, 0)) != NULL) {
 		/* drain the bucket completely before selecting the next one */
 		while ((m = _getq(&b->queue)) != NULL) {
 			PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m));
 			m_freem(m);
 		}
 		ASSERT(qlen(&b->queue) == 0);
 	}
 }
 
 /*
  * Fill `sp' with the statistics of class `cl': handle, queue limit,
  * transmit/drop counters, queue type and the summed length of all
  * active buckets.  The RED/RIO/CODEL members of `sp' are only written
  * when the class uses the corresponding queue type; other members are
  * left untouched.
  */
 static void
 get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl)
 {
 	fairq_bucket_t *b;
 
 	sp->class_handle = cl->cl_handle;
 	sp->qlimit = cl->cl_qlimit;
 	sp->xmit_cnt = cl->cl_xmitcnt;
 	sp->drop_cnt = cl->cl_dropcnt;
 	sp->qtype = cl->cl_qtype;
 	sp->qlength = 0;
 
 	/* walk the circular list of active buckets exactly once */
 	if (cl->cl_head) {
 		b = cl->cl_head;
 		do {
 			sp->qlength += qlen(&b->queue);
 			b = b->next;
 		} while (b != cl->cl_head);
 	}
 
 #ifdef ALTQ_RED
 	if (cl->cl_qtype == Q_RED)
 		red_getstats(cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_RIO
 	if (cl->cl_qtype == Q_RIO)
 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_CODEL
 	if (cl->cl_qtype == Q_CODEL)
 		codel_getstats(cl->cl_codel, &sp->codel);
 #endif
 }
 
 /*
  * Map a class handle to its class.  Handle 0 never matches; otherwise
  * the class table is searched from the highest priority in use
  * downwards.  Returns NULL when no class carries the handle.
  */
 static struct fairq_class *
 clh_to_clp(struct fairq_if *pif, uint32_t chandle)
 {
 	int pri;
 
 	if (chandle == 0)
 		return (NULL);
 
 	for (pri = pif->pif_maxpri; pri >= 0; pri--) {
 		struct fairq_class *cand = pif->pif_classes[pri];
 
 		if (cand == NULL || cand->cl_handle != chandle)
 			continue;
 		return (cand);
 	}
 
 	return (NULL);
 }
 
 #endif /* ALTQ_FAIRQ */
diff --git a/sys/net/altq/altq_hfsc.c b/sys/net/altq/altq_hfsc.c
index 5551ad1313e4..16b25d215feb 100644
--- a/sys/net/altq/altq_hfsc.c
+++ b/sys/net/altq/altq_hfsc.c
@@ -1,1736 +1,1737 @@
 /*-
  * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation is hereby granted (including for commercial or
  * for-profit use), provided that both the copyright notice and this
  * permission notice appear in all copies of the software, derivative
  * works, or modified versions, and any portions thereof.
  *
  * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
  * WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON PROVIDES THIS
  * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
  * Carnegie Mellon encourages (but does not require) users of this
  * software to return any improvements or extensions that they make,
  * and to grant Carnegie Mellon the rights to redistribute these
  * changes without encumbrance.
  *
  * $KAME: altq_hfsc.c,v 1.24 2003/12/05 05:40:46 kjc Exp $
  * $FreeBSD$
  */
 /*
  * H-FSC is described in Proceedings of SIGCOMM'97,
  * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
  * Real-Time and Priority Service"
  * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
  *
  * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
  * when a class has an upperlimit, the fit-time is computed from the
  * upperlimit service curve.  the link-sharing scheduler does not schedule
  * a class whose fit-time exceeds the current time.
  */
 
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef ALTQ_HFSC  /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/queue.h>
 #if 1 /* ALTQ3_COMPAT */
 #include <sys/sockio.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #endif /* ALTQ3_COMPAT */
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <netpfil/pf/pf_mtag.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_hfsc.h>
 
 /*
  * function prototypes
  */
 static int			 hfsc_clear_interface(struct hfsc_if *);
 static int			 hfsc_request(struct ifaltq *, int, void *);
 static void			 hfsc_purge(struct hfsc_if *);
 static struct hfsc_class	*hfsc_class_create(struct hfsc_if *,
     struct service_curve *, struct service_curve *, struct service_curve *,
     struct hfsc_class *, int, int, int);
 static int			 hfsc_class_destroy(struct hfsc_class *);
 static struct hfsc_class	*hfsc_nextclass(struct hfsc_class *);
 static int			 hfsc_enqueue(struct ifaltq *, struct mbuf *,
 				    struct altq_pktattr *);
 static struct mbuf		*hfsc_dequeue(struct ifaltq *, int);
 
 static int		 hfsc_addq(struct hfsc_class *, struct mbuf *);
 static struct mbuf	*hfsc_getq(struct hfsc_class *);
 static struct mbuf	*hfsc_pollq(struct hfsc_class *);
 static void		 hfsc_purgeq(struct hfsc_class *);
 
 static void		 update_cfmin(struct hfsc_class *);
 static void		 set_active(struct hfsc_class *, int);
 static void		 set_passive(struct hfsc_class *);
 
 static void		 init_ed(struct hfsc_class *, int);
 static void		 update_ed(struct hfsc_class *, int);
 static void		 update_d(struct hfsc_class *, int);
 static void		 init_vf(struct hfsc_class *, int);
 static void		 update_vf(struct hfsc_class *, int, u_int64_t);
 static void		 ellist_insert(struct hfsc_class *);
 static void		 ellist_remove(struct hfsc_class *);
 static void		 ellist_update(struct hfsc_class *);
 struct hfsc_class	*hfsc_get_mindl(struct hfsc_if *, u_int64_t);
 static void		 actlist_insert(struct hfsc_class *);
 static void		 actlist_remove(struct hfsc_class *);
 static void		 actlist_update(struct hfsc_class *);
 
 static struct hfsc_class	*actlist_firstfit(struct hfsc_class *,
 				    u_int64_t);
 
 static __inline u_int64_t	seg_x2y(u_int64_t, u_int64_t);
 static __inline u_int64_t	seg_y2x(u_int64_t, u_int64_t);
 static __inline u_int64_t	m2sm(u_int64_t);
 static __inline u_int64_t	m2ism(u_int64_t);
 static __inline u_int64_t	d2dx(u_int);
 static u_int64_t		sm2m(u_int64_t);
 static u_int			dx2d(u_int64_t);
 
 static void		sc2isc(struct service_curve *, struct internal_sc *);
 static void		rtsc_init(struct runtime_sc *, struct internal_sc *,
 			    u_int64_t, u_int64_t);
 static u_int64_t	rtsc_y2x(struct runtime_sc *, u_int64_t);
 static u_int64_t	rtsc_x2y(struct runtime_sc *, u_int64_t);
 static void		rtsc_min(struct runtime_sc *, struct internal_sc *,
 			    u_int64_t, u_int64_t);
 
 static void			 get_class_stats_v0(struct hfsc_classstats_v0 *,
 				    struct hfsc_class *);
 static void			 get_class_stats_v1(struct hfsc_classstats_v1 *,
 				    struct hfsc_class *);
 static struct hfsc_class	*clh_to_clp(struct hfsc_if *, u_int32_t);
 
 /*
  * macros
  */
 #define	is_a_parent_class(cl)	((cl)->cl_children != NULL)
 
 #define	HT_INFINITY	0xffffffffffffffffULL	/* infinite time value */
 
 /*
  * Attach the hfsc discipline stored in a->altq_disc to the send queue
  * of the interface named by a->ifname.
  *
  * Returns EINVAL when the interface cannot be found or no discipline
  * state is present, otherwise whatever altq_attach() returns.
  */
 int
 hfsc_pfattach(struct pf_altq *a)
 {
 	struct ifnet *ifp = ifunit(a->ifname);
 	int error, s;
 
 	if (ifp == NULL || a->altq_disc == NULL)
 		return (EINVAL);
 
 	s = splnet();
 	error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc,
 	    hfsc_enqueue, hfsc_dequeue, hfsc_request);
 	splx(s);
 
 	return (error);
 }
 
 /*
  * Allocate the per-interface hfsc state for `ifp' and record it in
  * a->altq_disc.
  *
  * Returns EINVAL for a NULL interface, ENODEV when the interface send
  * queue is not ALTQ-ready, ENOMEM when the allocation fails and 0 on
  * success.
  */
 int
 hfsc_add_altq(struct ifnet *ifp, struct pf_altq *a)
 {
 	struct hfsc_if *state;
 
 	if (ifp == NULL)
 		return (EINVAL);
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return (ENODEV);
 
 	state = malloc(sizeof(*state), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (state == NULL)
 		return (ENOMEM);
 
 	state->hif_ifq = &ifp->if_snd;
 	TAILQ_INIT(&state->hif_eligible);
 
 	/* the discipline state lives in the pf_altq entry */
 	a->altq_disc = state;
 
 	return (0);
 }
 
 /*
  * Detach the hfsc state from the pf_altq entry, destroy all classes
  * below the root, then the root class itself, and release the state.
  * Returns EINVAL if no state is attached.
  */
 int
 hfsc_remove_altq(struct pf_altq *a)
 {
 	struct hfsc_if *state = a->altq_disc;
 
 	if (state == NULL)
 		return (EINVAL);
 
 	a->altq_disc = NULL;
 
 	/* children first, the root class last */
 	(void)hfsc_clear_interface(state);
 	(void)hfsc_class_destroy(state->hif_rootclass);
 	free(state, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Create the hfsc class described by the pf_altq entry `a'.
  *
  * Returns EINVAL when no discipline state is attached, the parent
  * cannot be resolved or the queue id is 0; EBUSY when the queue id is
  * already in use; ENOMEM when the class cannot be created.
  */
 int
 hfsc_add_queue(struct pf_altq *a)
 {
 	struct hfsc_if *hif = a->altq_disc;
 	struct hfsc_class *parent = NULL;
 	struct hfsc_opts_v1 *opts;
 	struct service_curve rtsc, lssc, ulsc;
 
 	if (hif == NULL)
 		return (EINVAL);
 
 	opts = &a->pq_u.hfsc_opts;
 
 	/*
 	 * Only a request with the null parent handle made while no root
 	 * class exists may go without a parent; everything else must
 	 * name an existing class.
 	 */
 	if (a->parent_qid != HFSC_NULLCLASS_HANDLE ||
 	    hif->hif_rootclass != NULL) {
 		parent = clh_to_clp(hif, a->parent_qid);
 		if (parent == NULL)
 			return (EINVAL);
 	}
 
 	if (a->qid == 0)
 		return (EINVAL);
 
 	if (clh_to_clp(hif, a->qid) != NULL)
 		return (EBUSY);
 
 	/* the three service curves, copied from the options */
 	rtsc.m1 = opts->rtsc_m1;
 	rtsc.d  = opts->rtsc_d;
 	rtsc.m2 = opts->rtsc_m2;
 
 	lssc.m1 = opts->lssc_m1;
 	lssc.d  = opts->lssc_d;
 	lssc.m2 = opts->lssc_m2;
 
 	ulsc.m1 = opts->ulsc_m1;
 	ulsc.d  = opts->ulsc_d;
 	ulsc.m2 = opts->ulsc_m2;
 
 	if (hfsc_class_create(hif, &rtsc, &lssc, &ulsc,
 	    parent, a->qlimit, opts->flags, a->qid) == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Destroy the class whose handle is a->qid.  Returns EINVAL when no
  * discipline state is attached or the class is unknown, otherwise the
  * result of hfsc_class_destroy().
  */
 int
 hfsc_remove_queue(struct pf_altq *a)
 {
 	struct hfsc_if *hif = a->altq_disc;
 	struct hfsc_class *victim;
 
 	if (hif == NULL)
 		return (EINVAL);
 
 	victim = clh_to_clp(hif, a->qid);
 	if (victim == NULL)
 		return (EINVAL);
 
 	return (hfsc_class_destroy(victim));
 }
 
 /*
  * Copy the statistics of the class identified by a->qid on interface
  * a->ifname out to the user buffer `ubuf', in the layout selected by
  * `version' (0 or 1).
  *
  * On entry *nbytes is the size of the user buffer; on success it is set
  * to the number of bytes copied.
  *
  * Returns EBADF when no hfsc discipline is found for the interface,
  * EINVAL when the class does not exist, the version is not supported or
  * the buffer is too small, or the error returned by copyout().
  */
 int
 hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	struct hfsc_if *hif;
 	struct hfsc_class *cl;
 	union {
 		struct hfsc_classstats_v0 v0;
 		struct hfsc_classstats_v1 v1;
 	} stats;
 	size_t stats_size;
 	int error = 0;
 
 	if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL)
 		return (EBADF);
 
 	if ((cl = clh_to_clp(hif, a->qid)) == NULL)
 		return (EINVAL);
 
 	if (version > HFSC_STATS_VERSION)
 		return (EINVAL);
 
 	/* clear first so unused members and padding are not leaked */
 	memset(&stats, 0, sizeof(stats));
 	switch (version) {
 	case 0:
 		get_class_stats_v0(&stats.v0, cl);
 		stats_size = sizeof(struct hfsc_classstats_v0);
 		break;
 	case 1:
 		get_class_stats_v1(&stats.v1, cl);
 		stats_size = sizeof(struct hfsc_classstats_v1);
 		break;
 	default:
 		/*
 		 * Any other value (e.g. a negative version, which passes
 		 * the upper-bound check above) would otherwise leave
 		 * stats_size uninitialized.
 		 */
 		return (EINVAL);
 	}
 
 	/*
 	 * Reject negative sizes explicitly: compared directly against a
 	 * size_t a negative int is converted to a huge unsigned value
 	 * and would slip past the check.
 	 */
 	if (*nbytes < 0 || (size_t)*nbytes < stats_size)
 		return (EINVAL);
 
 	if ((error = copyout((caddr_t)&stats, ubuf, stats_size)) != 0)
 		return (error);
 	*nbytes = stats_size;
 	return (0);
 }
 
 /*
  * reset the interface to its initial state: every class below the
  * root class is destroyed, the root class itself is kept.
  * always returns 0.
  */
 static int
 hfsc_clear_interface(struct hfsc_if *hif)
 {
 	struct hfsc_class *leaf;
 
 	for (;;) {
 		if (hif->hif_rootclass == NULL)
 			break;
 		leaf = hif->hif_rootclass->cl_children;
 		if (leaf == NULL)
 			break;
 
 		/*
 		 * walk the hierarchy until the first class without
 		 * children shows up, destroy it, then restart from
 		 * the root.
 		 */
 		while (leaf != NULL && is_a_parent_class(leaf))
 			leaf = hfsc_nextclass(leaf);
 		if (leaf != NULL)
 			(void)hfsc_class_destroy(leaf);
 	}
 
 	return (0);
 }
 
 /*
  * request handler for the discipline.  only ALTRQ_PURGE is acted
  * upon; every other request is ignored.  always returns 0.
  */
 static int
 hfsc_request(struct ifaltq *ifq, int req, void *arg)
 {
 	struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (req == ALTRQ_PURGE)
 		hfsc_purge(hif);
 
 	return (0);
 }
 
 /* drop every packet queued on any class of the interface */
 static void
 hfsc_purge(struct hfsc_if *hif)
 {
 	struct hfsc_class *cl = hif->hif_rootclass;
 
 	while (cl != NULL) {
 		if (!qempty(cl->cl_q))
 			hfsc_purgeq(cl);
 		cl = hfsc_nextclass(cl);
 	}
 
 	/* reset the interface queue length when altq is enabled */
 	if (ALTQ_IS_ENABLED(hif->hif_ifq))
 		hif->hif_ifq->ifq_len = 0;
 }
 
 /*
  * Allocate a new class and link it into the hierarchy of hif.
  *
  * rsc, fsc, usc - real-time, link-sharing and upper-limit service
  *                 curves; a curve that is NULL or has m1 == m2 == 0
  *                 is not installed
  * parent        - parent class, or NULL to create the root class
  * qlimit        - queue limit in packets, 0 selects the default (50)
  * flags         - HFCF_* flags
  * qid           - class handle
  *
  * Returns the new class, or NULL when the class table is full, a
  * requested queue management option is not compiled in, or a memory
  * allocation fails.
  */
 struct hfsc_class *
 hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc,
     struct service_curve *fsc, struct service_curve *usc,
     struct hfsc_class *parent, int qlimit, int flags, int qid)
 {
 	struct hfsc_class *cl;
 	int i, s;
 
 	if (hif->hif_classes >= HFSC_MAX_CLASSES)
 		return (NULL);
 
 #ifndef ALTQ_RED
 	if (flags & HFCF_RED) {
 #ifdef ALTQ_DEBUG
 		printf("hfsc_class_create: RED not configured for HFSC!\n");
 #endif
 		return (NULL);
 	}
 #endif
 #ifndef ALTQ_CODEL
 	if (flags & HFCF_CODEL) {
 #ifdef ALTQ_DEBUG
 		printf("hfsc_class_create: CODEL not configured for HFSC!\n");
 #endif
 		return (NULL);
 	}
 #endif
 
 	/* M_ZERO: the err_ret path relies on unset pointers being NULL */
 	cl = malloc(sizeof(struct hfsc_class), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cl == NULL)
 		return (NULL);
 
 	cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cl->cl_q == NULL)
 		goto err_ret;
 
 	TAILQ_INIT(&cl->cl_actc);
 
 	if (qlimit == 0)
 		qlimit = 50;  /* use default */
 	qlimit(cl->cl_q) = qlimit;
 	qtype(cl->cl_q) = Q_DROPTAIL;
 	qlen(cl->cl_q) = 0;
 	qsize(cl->cl_q) = 0;
 	cl->cl_flags = flags;
 #ifdef ALTQ_RED
 	if (flags & (HFCF_RED|HFCF_RIO)) {
 		int red_flags, red_pkttime;
 		u_int m2;
 
 		/* use the largest m2 of the three curves as the rate */
 		m2 = 0;
 		if (rsc != NULL && rsc->m2 > m2)
 			m2 = rsc->m2;
 		if (fsc != NULL && fsc->m2 > m2)
 			m2 = fsc->m2;
 		if (usc != NULL && usc->m2 > m2)
 			m2 = usc->m2;
 
 		red_flags = 0;
 		if (flags & HFCF_ECN)
 			red_flags |= REDF_ECN;
 #ifdef ALTQ_RIO
 		if (flags & HFCF_CLEARDSCP)
 			red_flags |= RIOF_CLEARDSCP;
 #endif
 		if (m2 < 8)
 			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
 		else
 			red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu
 				* 1000 * 1000 * 1000 / (m2 / 8);
 		/*
 		 * a failed red/rio allocation is not fatal: the queue
 		 * simply stays a drop-tail queue.
 		 */
 		if (flags & HFCF_RED) {
 			cl->cl_red = red_alloc(0, 0,
 			    qlimit(cl->cl_q) * 10/100,
 			    qlimit(cl->cl_q) * 30/100,
 			    red_flags, red_pkttime);
 			if (cl->cl_red != NULL)
 				qtype(cl->cl_q) = Q_RED;
 		}
 #ifdef ALTQ_RIO
 		else {
 			cl->cl_red = (red_t *)rio_alloc(0, NULL,
 			    red_flags, red_pkttime);
 			if (cl->cl_red != NULL)
 				qtype(cl->cl_q) = Q_RIO;
 		}
 #endif
 	}
 #endif /* ALTQ_RED */
 #ifdef ALTQ_CODEL
 	if (flags & HFCF_CODEL) {
 		cl->cl_codel = codel_alloc(5, 100, 0);
 		if (cl->cl_codel != NULL)
 			qtype(cl->cl_q) = Q_CODEL;
 	}
 #endif
 
 	if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) {
 		cl->cl_rsc = malloc(sizeof(struct internal_sc),
 		    M_DEVBUF, M_NOWAIT);
 		if (cl->cl_rsc == NULL)
 			goto err_ret;
 		sc2isc(rsc, cl->cl_rsc);
 		rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0);
 		rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0);
 	}
 	if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) {
 		cl->cl_fsc = malloc(sizeof(struct internal_sc),
 		    M_DEVBUF, M_NOWAIT);
 		if (cl->cl_fsc == NULL)
 			goto err_ret;
 		sc2isc(fsc, cl->cl_fsc);
 		rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0);
 	}
 	if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) {
 		cl->cl_usc = malloc(sizeof(struct internal_sc),
 		    M_DEVBUF, M_NOWAIT);
 		if (cl->cl_usc == NULL)
 			goto err_ret;
 		sc2isc(usc, cl->cl_usc);
 		rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0);
 	}
 
 	cl->cl_id = hif->hif_classid++;
 	cl->cl_handle = qid;
 	cl->cl_hif = hif;
 	cl->cl_parent = parent;
 
 	s = splnet();
 	IFQ_LOCK(hif->hif_ifq);
 
 	/*
 	 * find a free slot in the class table.  if the slot matching
 	 * the lower bits of qid is free, use this slot.  otherwise,
 	 * use the first free slot.
 	 *
 	 * the modulo is taken on the unsigned value (as clh_to_clp()
 	 * does) so that a qid with the top bit set cannot produce a
 	 * negative table index.
 	 */
 	i = (u_int32_t)qid % HFSC_MAX_CLASSES;
 	if (hif->hif_class_tbl[i] == NULL)
 		hif->hif_class_tbl[i] = cl;
 	else {
 		for (i = 0; i < HFSC_MAX_CLASSES; i++)
 			if (hif->hif_class_tbl[i] == NULL) {
 				hif->hif_class_tbl[i] = cl;
 				break;
 			}
 		if (i == HFSC_MAX_CLASSES) {
 			IFQ_UNLOCK(hif->hif_ifq);
 			splx(s);
 			goto err_ret;
 		}
 	}
 	cl->cl_slot = i;
 
 	/*
 	 * count the class only once it owns a slot, so that a failed
 	 * slot search does not leave hif_classes permanently inflated.
 	 */
 	hif->hif_classes++;
 
 	if (flags & HFCF_DEFAULTCLASS)
 		hif->hif_defaultclass = cl;
 
 	if (parent == NULL) {
 		/* this is root class */
 		hif->hif_rootclass = cl;
 	} else {
 		/*
 		 * put the new class at the head of the parent's
 		 * children list (cl_siblings stays NULL when the list
 		 * was empty).
 		 */
 		cl->cl_siblings = parent->cl_children;
 		parent->cl_children = cl;
 	}
 	IFQ_UNLOCK(hif->hif_ifq);
 	splx(s);
 
 	return (cl);
 
  err_ret:
 	/* undo every partial allocation; cl was zeroed on allocation */
 	if (cl->cl_red != NULL) {
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->cl_q))
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->cl_q))
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->cl_q))
 			codel_destroy(cl->cl_codel);
 #endif
 	}
 	if (cl->cl_fsc != NULL)
 		free(cl->cl_fsc, M_DEVBUF);
 	if (cl->cl_rsc != NULL)
 		free(cl->cl_rsc, M_DEVBUF);
 	if (cl->cl_usc != NULL)
 		free(cl->cl_usc, M_DEVBUF);
 	if (cl->cl_q != NULL)
 		free(cl->cl_q, M_DEVBUF);
 	free(cl, M_DEVBUF);
 	return (NULL);
 }
 
 static int
 hfsc_class_destroy(struct hfsc_class *cl)
 {
 	int s;
 
 	if (cl == NULL)
 		return (0);
 
 	if (is_a_parent_class(cl))
 		return (EBUSY);
 
 	s = splnet();
 	IFQ_LOCK(cl->cl_hif->hif_ifq);
 
 	if (!qempty(cl->cl_q))
 		hfsc_purgeq(cl);
 
 	if (cl->cl_parent == NULL) {
 		/* this is root class */
 	} else {
 		struct hfsc_class *p = cl->cl_parent->cl_children;
 
 		if (p == cl)
 			cl->cl_parent->cl_children = cl->cl_siblings;
 		else do {
 			if (p->cl_siblings == cl) {
 				p->cl_siblings = cl->cl_siblings;
 				break;
 			}
 		} while ((p = p->cl_siblings) != NULL);
 		ASSERT(p != NULL);
 	}
 
 	cl->cl_hif->hif_class_tbl[cl->cl_slot] = NULL;
 	cl->cl_hif->hif_classes--;
 	IFQ_UNLOCK(cl->cl_hif->hif_ifq);
 	splx(s);
 
 	if (cl->cl_red != NULL) {
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->cl_q))
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->cl_q))
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->cl_q))
 			codel_destroy(cl->cl_codel);
 #endif
 	}
 
 	IFQ_LOCK(cl->cl_hif->hif_ifq);
 	if (cl == cl->cl_hif->hif_rootclass)
 		cl->cl_hif->hif_rootclass = NULL;
 	if (cl == cl->cl_hif->hif_defaultclass)
 		cl->cl_hif->hif_defaultclass = NULL;
 	IFQ_UNLOCK(cl->cl_hif->hif_ifq);
 
 	if (cl->cl_usc != NULL)
 		free(cl->cl_usc, M_DEVBUF);
 	if (cl->cl_fsc != NULL)
 		free(cl->cl_fsc, M_DEVBUF);
 	if (cl->cl_rsc != NULL)
 		free(cl->cl_rsc, M_DEVBUF);
 	free(cl->cl_q, M_DEVBUF);
 	free(cl, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * hfsc_nextclass steps to the next class of a pre-order walk of the
  * class tree and returns NULL once the walk is complete.
  *   usage:
  *	for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl))
  *		do_something;
  */
 static struct hfsc_class *
 hfsc_nextclass(struct hfsc_class *cl)
 {
 	/* descend first, then move sideways */
 	if (cl->cl_children != NULL)
 		return (cl->cl_children);
 	if (cl->cl_siblings != NULL)
 		return (cl->cl_siblings);
 
 	/* otherwise climb until an ancestor has a sibling to visit */
 	for (cl = cl->cl_parent; cl != NULL; cl = cl->cl_parent) {
 		if (cl->cl_siblings != NULL)
 			return (cl->cl_siblings);
 	}
 
 	return (NULL);
 }
 
 /*
  * hfsc_enqueue is the enqueue function registered to
  * (*altq_enqueue) in struct ifaltq.
  *
  * returns 0 when the packet was queued and ENOBUFS when it was
  * dropped; a dropped mbuf has already been freed.
  */
 static int
 hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
 	struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc;
 	struct hfsc_class *target = NULL;
 	struct pf_mtag *tag;
 	int pktlen;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (!(m->m_flags & M_PKTHDR)) {
 		/* should not happen */
 		printf("altq: packet for %s does not have pkthdr\n",
 		    ifq->altq_ifp->if_xname);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* pick up the class chosen by the classifier, if any */
 	tag = pf_find_mtag(m);
 	if (tag != NULL)
 		target = clh_to_clp(hif, tag->qid);
 
 	/* unknown or interior classes fall back to the default class */
 	if (target == NULL || is_a_parent_class(target))
 		target = hif->hif_defaultclass;
 	if (target == NULL) {
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	target->cl_pktattr = NULL;
 	pktlen = m_pktlen(m);
 	if (hfsc_addq(target, m) != 0) {
 		/* drop occurred.  mbuf was freed in hfsc_addq. */
 		PKTCNTR_ADD(&target->cl_stats.drop_cnt, pktlen);
 		return (ENOBUFS);
 	}
 
 	/* successfully queued. */
 	IFQ_INC_LEN(ifq);
 	target->cl_hif->hif_packets++;
 
 	/* the first packet of a backlog period activates the class */
 	if (qlen(target->cl_q) == 1)
 		set_active(target, m_pktlen(m));
 
 	return (0);
 }
 
 /*
  * hfsc_dequeue is a dequeue function to be registered to
  * (*altq_dequeue) in struct ifaltq.
  *
  * note: ALTDQ_POLL returns the next packet without removing the packet
  *	from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
  *	ALTDQ_REMOVE must return the same packet if called immediately
  *	after ALTDQ_POLL.
  *
  * returns NULL when no packet is queued or when no class is currently
  * allowed to send.
  */
 static struct mbuf *
 hfsc_dequeue(struct ifaltq *ifq, int op)
 {
 	struct hfsc_if	*hif = (struct hfsc_if *)ifq->altq_disc;
 	struct hfsc_class *cl;
 	struct mbuf *m;
 	int len, next_len;
 	int realtime = 0;	/* set when cl was picked by real-time criteria */
 	u_int64_t cur_time;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (hif->hif_packets == 0)
 		/* no packet in the tree */
 		return (NULL);
 
 	cur_time = read_machclk();
 
 	if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) {
 		/* reuse the class selected by the preceding ALTDQ_POLL */
 		cl = hif->hif_pollcache;
 		hif->hif_pollcache = NULL;
 		/* check if the class was scheduled by real-time criteria */
 		if (cl->cl_rsc != NULL)
 			realtime = (cl->cl_e <= cur_time);
 	} else {
 		/*
 		 * if there are eligible classes, use real-time criteria.
 		 * find the class with the minimum deadline among
 		 * the eligible classes.
 		 */
 		if ((cl = hfsc_get_mindl(hif, cur_time))
 		    != NULL) {
 			realtime = 1;
 		} else {
 #ifdef ALTQ_DEBUG
 			int fits = 0;
 #endif
 			/*
 			 * use link-sharing criteria
 			 * get the class with the minimum vt in the hierarchy
 			 */
 			cl = hif->hif_rootclass;
 			/* descend one level per iteration until a leaf is reached */
 			while (is_a_parent_class(cl)) {
 				cl = actlist_firstfit(cl, cur_time);
 				if (cl == NULL) {
 #ifdef ALTQ_DEBUG
 					if (fits > 0)
 						printf("%d fit but none found\n",fits);
 #endif
 					return (NULL);
 				}
 				/*
 				 * update parent's cl_cvtmin.
 				 * don't update if the new vt is smaller.
 				 */
 				if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
 					cl->cl_parent->cl_cvtmin = cl->cl_vt;
 #ifdef ALTQ_DEBUG
 				fits++;
 #endif
 			}
 		}
 
 		if (op == ALTDQ_POLL) {
 			/* remember the choice for the following ALTDQ_REMOVE */
 			hif->hif_pollcache = cl;
 			m = hfsc_pollq(cl);
 			return (m);
 		}
 	}
 
 	m = hfsc_getq(cl);
 	if (m == NULL)
 		panic("hfsc_dequeue:");
 	len = m_pktlen(m);
 	cl->cl_hif->hif_packets--;
 	IFQ_DEC_LEN(ifq);
 	PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len);
 
 	update_vf(cl, len, cur_time);
 	/* cl_cumul only accumulates bytes sent under real-time criteria */
 	if (realtime)
 		cl->cl_cumul += len;
 
 	if (!qempty(cl->cl_q)) {
 		if (cl->cl_rsc != NULL) {
 			/* update ed */
 			next_len = m_pktlen(qhead(cl->cl_q));
 
 			if (realtime)
 				update_ed(cl, next_len);
 			else
 				update_d(cl, next_len);
 		}
 	} else {
 		/* the class becomes passive */
 		set_passive(cl);
 	}
 
 	return (m);
 }
 
 /*
  * Append m to the queue of cl.  RIO, RED and CoDel queues are handed
  * to their own add routines; a plain queue drops the packet when the
  * queue limit is reached.
  *
  * Returns 0 when the packet was queued; on the drop-tail path the
  * mbuf is freed and -1 is returned when the queue is full.
  */
 static int
 hfsc_addq(struct hfsc_class *cl, struct mbuf *m)
 {
 
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		return rio_addq((rio_t *)cl->cl_red, cl->cl_q,
 				m, cl->cl_pktattr);
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		return codel_addq(cl->cl_codel, cl->cl_q, m);
 #endif
 	/* drop-tail: refuse the packet once the limit is reached */
 	if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) {
 		m_freem(m);
 		return (-1);
 	}
 
 	/* zero the DS field when the class asks for it */
 	if (cl->cl_flags & HFCF_CLEARDSCP)
 		write_dsfield(m, cl->cl_pktattr, 0);
 
 	_addq(cl->cl_q, m);
 
 	return (0);
 }
 
 /*
  * Remove and return the next packet of cl, using the get routine
  * that matches the queue type (RIO, RED, CoDel or plain queue).
  */
 static struct mbuf *
 hfsc_getq(struct hfsc_class *cl)
 {
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		return rio_getq((rio_t *)cl->cl_red, cl->cl_q);
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		return red_getq(cl->cl_red, cl->cl_q);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		return codel_getq(cl->cl_codel, cl->cl_q);
 #endif
 	/* plain queue */
 	return _getq(cl->cl_q);
 }
 
 /* return the packet at the head of cl's queue without removing it */
 static struct mbuf *
 hfsc_pollq(struct hfsc_class *cl)
 {
 	struct mbuf *head;
 
 	head = qhead(cl->cl_q);
 	return (head);
 }
 
 /*
  * free every packet queued on cl, accounting each one as a drop,
  * then take the class off the active and eligible lists.
  */
 static void
 hfsc_purgeq(struct hfsc_class *cl)
 {
 	struct mbuf *pkt;
 
 	if (qempty(cl->cl_q))
 		return;
 
 	for (pkt = _getq(cl->cl_q); pkt != NULL; pkt = _getq(cl->cl_q)) {
 		PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(pkt));
 		m_freem(pkt);
 		cl->cl_hif->hif_packets--;
 		IFQ_DEC_LEN(cl->cl_hif->hif_ifq);
 	}
 	ASSERT(qlen(cl->cl_q) == 0);
 
 	/* update_vf() with an empty queue removes cl from the actlist */
 	update_vf(cl, 0, 0);
 	set_passive(cl);
 }
 
 /*
  * start a backlog period for cl: initialize the eligible/deadline
  * state when the class has a real-time curve, the virtual time state
  * when it has a link-sharing curve, and count the period.
  */
 static void
 set_active(struct hfsc_class *cl, int len)
 {
 	if (cl->cl_rsc != NULL) {
 		init_ed(cl, len);
 	}
 	if (cl->cl_fsc != NULL) {
 		init_vf(cl, len);
 	}
 
 	cl->cl_stats.period += 1;
 }
 
 /*
  * take cl off the eligible list when it has a real-time curve.
  *
  * the actlist is handled in update_vf(); update_vf(cl, 0, 0) has to
  * be called explicitly to remove a class from the actlist.
  */
 static void
 set_passive(struct hfsc_class *cl)
 {
 	if (cl->cl_rsc == NULL)
 		return;
 
 	ellist_remove(cl);
 }
 
 static void
 init_ed(struct hfsc_class *cl, int next_len)
 {
 	u_int64_t cur_time;
 
 	cur_time = read_machclk();
 
 	/* update the deadline curve */
 	rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul);
 
 	/*
 	 * update the eligible curve.
 	 * for concave, it is equal to the deadline curve.
 	 * for convex, it is a linear curve with slope m2.
 	 */
 	cl->cl_eligible = cl->cl_deadline;
 	if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) {
 		cl->cl_eligible.dx = 0;
 		cl->cl_eligible.dy = 0;
 	}
 
 	/* compute e and d */
 	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
 	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
 
 	ellist_insert(cl);
 }
 
 static void
 update_ed(struct hfsc_class *cl, int next_len)
 {
 	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
 	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
 
 	ellist_update(cl);
 }
 
 static void
 update_d(struct hfsc_class *cl, int next_len)
 {
 	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
 }
 
 /*
  * Initialize the link-sharing state (virtual time and fit time) of cl
  * at the start of a backlog period and propagate the change towards
  * the root.  The loop visits cl and each of its ancestors, stopping
  * before the root class (the class without a parent).
  *
  * The len argument is not used by the body.
  */
 static void
 init_vf(struct hfsc_class *cl, int len)
 {
 	struct hfsc_class *max_cl, *p;
 	u_int64_t vt, f, cur_time;
 	int go_active;
 
 	cur_time = 0;	/* 0 means "clock not read yet", see below */
 	go_active = 1;
 	for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) {
 		/*
 		 * a class goes active only when its active count rises
 		 * from zero; once one level was already active, no
 		 * ancestor changes state either.
 		 */
 		if (go_active && cl->cl_nactive++ == 0)
 			go_active = 1;
 		else
 			go_active = 0;
 
 		if (go_active) {
 			/* last entry of the parent's active list: largest vt */
 			max_cl = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead);
 			if (max_cl != NULL) {
 				/*
 				 * set vt to the average of the min and max
 				 * classes.  if the parent's period didn't
 				 * change, don't decrease vt of the class.
 				 */
 				vt = max_cl->cl_vt;
 				if (cl->cl_parent->cl_cvtmin != 0)
 					vt = (cl->cl_parent->cl_cvtmin + vt)/2;
 
 				if (cl->cl_parent->cl_vtperiod !=
 				    cl->cl_parentperiod || vt > cl->cl_vt)
 					cl->cl_vt = vt;
 			} else {
 				/*
 				 * first child for a new parent backlog period.
 				 * add parent's cvtmax to vtoff of children
 				 * to make a new vt (vtoff + vt) larger than
 				 * the vt in the last period for all children.
 				 */
 				vt = cl->cl_parent->cl_cvtmax;
 				for (p = cl->cl_parent->cl_children; p != NULL;
 				     p = p->cl_siblings)
 					p->cl_vtoff += vt;
 				cl->cl_vt = 0;
 				cl->cl_parent->cl_cvtmax = 0;
 				cl->cl_parent->cl_cvtmin = 0;
 			}
 			cl->cl_initvt = cl->cl_vt;
 
 			/* update the virtual curve */
 			vt = cl->cl_vt + cl->cl_vtoff;
 			rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total);
 			/*
 			 * when the curve was restarted at vt, fold vtoff
 			 * into the curve origin and reset vtoff.
 			 */
 			if (cl->cl_virtual.x == vt) {
 				cl->cl_virtual.x -= cl->cl_vtoff;
 				cl->cl_vtoff = 0;
 			}
 			cl->cl_vtadj = 0;
 
 			cl->cl_vtperiod++;  /* increment vt period */
 			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
 			if (cl->cl_parent->cl_nactive == 0)
 				cl->cl_parentperiod++;
 			cl->cl_f = 0;
 
 			actlist_insert(cl);
 
 			if (cl->cl_usc != NULL) {
 				/* class has upper limit curve */
 				/* read the clock at most once per call */
 				if (cur_time == 0)
 					cur_time = read_machclk();
 
 				/* update the ulimit curve */
 				rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time,
 				    cl->cl_total);
 				/* compute myf */
 				cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
 				    cl->cl_total);
 				cl->cl_myfadj = 0;
 			}
 		}
 
 		/* cl_f is max(cl_myf, cl_cfmin) */
 		if (cl->cl_myf > cl->cl_cfmin)
 			f = cl->cl_myf;
 		else
 			f = cl->cl_cfmin;
 		if (f != cl->cl_f) {
 			cl->cl_f = f;
 			/* the parent's minimum fit time may have changed */
 			update_cfmin(cl->cl_parent);
 		}
 	}
 }
 
 /*
  * Account len bytes of service to cl and to each of its ancestors
  * (stopping before the root class), updating virtual time and fit
  * time on the way up.  When the queue of cl is empty the class, and
  * every ancestor whose active count drops to zero, is removed from
  * its parent's active list; update_vf(cl, 0, 0) is used for exactly
  * that purpose.
  */
 static void
 update_vf(struct hfsc_class *cl, int len, u_int64_t cur_time)
 {
 	u_int64_t f, myf_bound, delta;
 	int go_passive;
 
 	/* the leaf goes passive when its queue has drained */
 	go_passive = qempty(cl->cl_q);
 
 	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
 		cl->cl_total += len;
 
 		/* nothing more to do without a link-sharing curve or when idle */
 		if (cl->cl_fsc == NULL || cl->cl_nactive == 0)
 			continue;
 
 		/*
 		 * the passive state propagates upwards only while each
 		 * level loses its last active child.
 		 */
 		if (go_passive && --cl->cl_nactive == 0)
 			go_passive = 1;
 		else
 			go_passive = 0;
 
 		if (go_passive) {
 			/* no more active child, going passive */
 
 			/* update cvtmax of the parent class */
 			if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
 				cl->cl_parent->cl_cvtmax = cl->cl_vt;
 
 			/* remove this class from the vt list */
 			actlist_remove(cl);
 
 			update_cfmin(cl->cl_parent);
 
 			continue;
 		}
 
 		/*
 		 * update vt and f
 		 */
 		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
 		    - cl->cl_vtoff + cl->cl_vtadj;
 
 		/*
 		 * if vt of the class is smaller than cvtmin,
 		 * the class was skipped in the past due to non-fit.
 		 * if so, we need to adjust vtadj.
 		 */
 		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
 			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
 			cl->cl_vt = cl->cl_parent->cl_cvtmin;
 		}
 
 		/* update the vt list */
 		actlist_update(cl);
 
 		if (cl->cl_usc != NULL) {
 			cl->cl_myf = cl->cl_myfadj
 			    + rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
 
 			/*
 			 * if myf lags behind by more than one clock tick
 			 * from the current time, adjust myfadj to prevent
 			 * a rate-limited class from going greedy.
 			 * in a steady state under rate-limiting, myf
 			 * fluctuates within one clock tick.
 			 */
 			/*
 			 * NOTE(review): this is unsigned arithmetic, so
 			 * with cur_time == 0 (as passed by hfsc_purgeq)
 			 * the subtraction wraps -- confirm this is
 			 * intended.
 			 */
 			myf_bound = cur_time - machclk_per_tick;
 			if (cl->cl_myf < myf_bound) {
 				delta = cur_time - cl->cl_myf;
 				cl->cl_myfadj += delta;
 				cl->cl_myf += delta;
 			}
 		}
 
 		/* cl_f is max(cl_myf, cl_cfmin) */
 		if (cl->cl_myf > cl->cl_cfmin)
 			f = cl->cl_myf;
 		else
 			f = cl->cl_cfmin;
 		if (f != cl->cl_f) {
 			cl->cl_f = f;
 			update_cfmin(cl->cl_parent);
 		}
 	}
 }
 
 /*
  * recompute cl_cfmin of cl as the smallest cl_f among its active
  * children.  the result is 0 when there is no active child or when
  * any active child has a cl_f of 0.
  */
 static void
 update_cfmin(struct hfsc_class *cl)
 {
 	struct hfsc_class *child;
 	u_int64_t lowest = HT_INFINITY;
 
 	if (TAILQ_EMPTY(&cl->cl_actc)) {
 		cl->cl_cfmin = 0;
 		return;
 	}
 
 	TAILQ_FOREACH(child, &cl->cl_actc, cl_actlist) {
 		if (child->cl_f == 0) {
 			/* nothing can be smaller, stop looking */
 			lowest = 0;
 			break;
 		}
 		if (child->cl_f < lowest)
 			lowest = child->cl_f;
 	}
 	cl->cl_cfmin = lowest;
 }
 
 /*
  * TAILQ based ellist and actlist implementation
  * (ion wanted to make a calendar queue based implementation)
  */
 /*
  * eligible list holds backlogged classes being sorted by their eligible times.
  * there is one eligible list per interface.
  */
 
 /* insert cl into the eligible list, which is kept sorted by cl_e */
 static void
 ellist_insert(struct hfsc_class *cl)
 {
 	struct hfsc_if *hif = cl->cl_hif;
 	struct hfsc_class *tail, *pos;
 
 	/* fast path: cl belongs at the end (or the list is empty) */
 	tail = TAILQ_LAST(&hif->hif_eligible, elighead);
 	if (tail == NULL || tail->cl_e <= cl->cl_e) {
 		TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist);
 		return;
 	}
 
 	/* otherwise insert in front of the first entry with a larger e */
 	TAILQ_FOREACH(pos, &hif->hif_eligible, cl_ellist) {
 		if (pos->cl_e > cl->cl_e) {
 			TAILQ_INSERT_BEFORE(pos, cl, cl_ellist);
 			return;
 		}
 	}
 	ASSERT(0); /* should not reach here */
 }
 
 /* take cl off the eligible list of its interface */
 static void
 ellist_remove(struct hfsc_class *cl)
 {
 	TAILQ_REMOVE(&cl->cl_hif->hif_eligible, cl, cl_ellist);
 }
 
 /*
  * Move cl to its sorted position in the eligible list after its
  * eligible time (cl_e) has been recomputed.  Only positions after the
  * current one are considered.
  */
 static void
 ellist_update(struct hfsc_class *cl)
 {
 	struct hfsc_if	*hif = cl->cl_hif;
 	struct hfsc_class *p, *last;
 
 	/*
 	 * the eligible time of a class increases monotonically.
 	 * if the next entry has a larger eligible time, nothing to do.
 	 */
 	p = TAILQ_NEXT(cl, cl_ellist);
 	if (p == NULL || cl->cl_e <= p->cl_e)
 		return;
 
 	/* check the last entry */
 	last = TAILQ_LAST(&hif->hif_eligible, elighead);
 	ASSERT(last != NULL);
 	if (last->cl_e <= cl->cl_e) {
 		/* cl now sorts after everything else: move it to the tail */
 		TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist);
 		TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist);
 		return;
 	}
 
 	/*
 	 * the new position must be between the next entry
 	 * and the last entry
 	 */
 	while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) {
 		if (cl->cl_e < p->cl_e) {
 			/* re-insert in front of the first larger entry */
 			TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist);
 			TAILQ_INSERT_BEFORE(p, cl, cl_ellist);
 			return;
 		}
 	}
 	ASSERT(0); /* should not reach here */
 }
 
 /*
  * return the class with the smallest deadline among the classes whose
  * eligible time is not later than cur_time, or NULL if there is none.
  */
 struct hfsc_class *
 hfsc_get_mindl(struct hfsc_if *hif, u_int64_t cur_time)
 {
 	struct hfsc_class *best = NULL;
 	struct hfsc_class *cand;
 
 	TAILQ_FOREACH(cand, &hif->hif_eligible, cl_ellist) {
 		/* sorted by eligible time: later entries cannot qualify */
 		if (cand->cl_e > cur_time)
 			break;
 		if (best == NULL || cand->cl_d < best->cl_d)
 			best = cand;
 	}
 
 	return (best);
 }
 
 /*
  * active children list holds backlogged child classes being sorted
  * by their virtual time.
  * each intermediate class has one active children list.
  */
 
 /* insert cl into its parent's active list, kept sorted by cl_vt */
 static void
 actlist_insert(struct hfsc_class *cl)
 {
 	struct hfsc_class *parent = cl->cl_parent;
 	struct hfsc_class *tail, *pos;
 
 	/* fast path: cl belongs at the end (or the list is empty) */
 	tail = TAILQ_LAST(&parent->cl_actc, acthead);
 	if (tail == NULL || tail->cl_vt <= cl->cl_vt) {
 		TAILQ_INSERT_TAIL(&parent->cl_actc, cl, cl_actlist);
 		return;
 	}
 
 	/* otherwise insert in front of the first entry with a larger vt */
 	TAILQ_FOREACH(pos, &parent->cl_actc, cl_actlist) {
 		if (pos->cl_vt > cl->cl_vt) {
 			TAILQ_INSERT_BEFORE(pos, cl, cl_actlist);
 			return;
 		}
 	}
 	ASSERT(0); /* should not reach here */
 }
 
 /* take cl off the active list of its parent */
 static void
 actlist_remove(struct hfsc_class *cl)
 {
 	struct hfsc_class *parent = cl->cl_parent;
 
 	TAILQ_REMOVE(&parent->cl_actc, cl, cl_actlist);
 }
 
 /*
  * Move cl to its sorted position in its parent's active list after
  * its virtual time (cl_vt) has been recomputed.  Only positions after
  * the current one are considered.
  */
 static void
 actlist_update(struct hfsc_class *cl)
 {
 	struct hfsc_class *p, *last;
 
 	/*
 	 * the virtual time of a class increases monotonically during its
 	 * backlogged period.
 	 * if the next entry has a larger virtual time, nothing to do.
 	 */
 	p = TAILQ_NEXT(cl, cl_actlist);
 	if (p == NULL || cl->cl_vt < p->cl_vt)
 		return;
 
 	/* check the last entry */
 	last = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead);
 	ASSERT(last != NULL);
 	if (last->cl_vt <= cl->cl_vt) {
 		/* cl now sorts after everything else: move it to the tail */
 		TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist);
 		TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist);
 		return;
 	}
 
 	/*
 	 * the new position must be between the next entry
 	 * and the last entry
 	 */
 	while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) {
 		if (cl->cl_vt < p->cl_vt) {
 			/* re-insert in front of the first larger entry */
 			TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist);
 			TAILQ_INSERT_BEFORE(p, cl, cl_actlist);
 			return;
 		}
 	}
 	ASSERT(0); /* should not reach here */
 }
 
 /*
  * return the first active child of cl whose fit time is not later
  * than cur_time, or NULL when no child fits.
  */
 static struct hfsc_class *
 actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time)
 {
 	struct hfsc_class *child;
 
 	TAILQ_FOREACH(child, &cl->cl_actc, cl_actlist) {
 		if (cur_time >= child->cl_f)
 			break;
 	}
 
 	/* child is NULL when the walk ran off the end of the list */
 	return (child);
 }
 
 /*
  * service curve support functions
  *
  *  external service curve parameters
  *	m: bits/sec
  *	d: msec
  *  internal service curve parameters
  *	sm: (bytes/machclk tick) << SM_SHIFT
  *	ism: (machclk ticks/byte) << ISM_SHIFT
  *	dx: machclk ticks
  *
  * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits.  we
  * should be able to handle 100K-100Gbps linkspeed with 256 MHz machclk
  * frequency and at least 3 effective digits in decimal.
  *
  */
 /* fixed-point shifts applied to the sm and ism slope values */
 #define	SM_SHIFT	24
 #define	ISM_SHIFT	14
 
 /* masks selecting the low SM_SHIFT / ISM_SHIFT bits of a value */
 #define	SM_MASK		((1LL << SM_SHIFT) - 1)
 #define	ISM_MASK	((1LL << ISM_SHIFT) - 1)
 
 /*
  * compute y = (x * sm) >> SM_SHIFT for one curve segment.
  * x is split into its upper and lower bits and the two partial
  * products are added, which avoids overflowing the multiplication.
  */
 static __inline u_int64_t
 seg_x2y(u_int64_t x, u_int64_t sm)
 {
 	u_int64_t upper = x >> SM_SHIFT;
 	u_int64_t lower = x & SM_MASK;
 
 	return (upper * sm + ((lower * sm) >> SM_SHIFT));
 }
 
 /*
  * compute x = (y * ism) >> ISM_SHIFT for one curve segment.
  * y == 0 yields 0 and an infinite ism yields HT_INFINITY; otherwise
  * y is split into upper and lower bits as in seg_x2y().
  */
 static __inline u_int64_t
 seg_y2x(u_int64_t y, u_int64_t ism)
 {
 	u_int64_t upper, lower;
 
 	if (y == 0)
 		return (0);
 	if (ism == HT_INFINITY)
 		return (HT_INFINITY);
 
 	upper = y >> ISM_SHIFT;
 	lower = y & ISM_MASK;
 	return (upper * ism + ((lower * ism) >> ISM_SHIFT));
 }
 
 /* convert an external slope m (bits/sec) into the scaled sm form */
 static __inline u_int64_t
 m2sm(u_int64_t m)
 {
 	return ((m << SM_SHIFT) / 8 / machclk_freq);
 }
 
 /*
  * convert an external slope m (bits/sec) into the scaled inverse
  * slope ism.  a slope of zero maps to HT_INFINITY.
  */
 static __inline u_int64_t
 m2ism(u_int64_t m)
 {
 	if (m == 0)
 		return (HT_INFINITY);
 
 	return (((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m);
 }
 
 /* convert a delay d (msec) into machclk ticks */
 static __inline u_int64_t
 d2dx(u_int d)
 {
 	return (((u_int64_t)d * machclk_freq) / 1000);
 }
 
 /* convert a scaled slope sm back into the external form (bits/sec) */
 static u_int64_t
 sm2m(u_int64_t sm)
 {
 	return ((sm * 8 * machclk_freq) >> SM_SHIFT);
 }
 
 /* convert machclk ticks back into msec, truncated to u_int */
 static u_int
 dx2d(u_int64_t dx)
 {
 	u_int64_t msec = dx * 1000 / machclk_freq;
 
 	return ((u_int)msec);
 }
 
 /* fill the internal service curve isc from the external curve sc */
 static void
 sc2isc(struct service_curve *sc, struct internal_sc *isc)
 {
 	/* slopes of the two segments and their inverses */
 	isc->sm1 = m2sm(sc->m1);
 	isc->sm2 = m2sm(sc->m2);
 	isc->ism1 = m2ism(sc->m1);
 	isc->ism2 = m2ism(sc->m2);
 
 	/* extent of the first segment; dy follows from dx and sm1 */
 	isc->dx = d2dx(sc->d);
 	isc->dy = seg_x2y(isc->dx, isc->sm1);
 }
 
 /*
  * set up the runtime service curve rtsc as a copy of the internal
  * service curve isc whose origin is placed at (x, y).
  */
 static void
 rtsc_init(struct runtime_sc *rtsc, struct internal_sc * isc, u_int64_t x,
     u_int64_t y)
 {
 	/* origin */
 	rtsc->x = x;
 	rtsc->y = y;
 
 	/* first segment */
 	rtsc->sm1 = isc->sm1;
 	rtsc->ism1 = isc->ism1;
 	rtsc->dx = isc->dx;
 	rtsc->dy = isc->dy;
 
 	/* second segment */
 	rtsc->sm2 = isc->sm2;
 	rtsc->ism2 = isc->ism2;
 }
 
 /*
  * calculate the x-projection of the runtime service curve for the
  * given y value (the inverse of rtsc_x2y()).
  */
 static u_int64_t
 rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y)
 {
 	/* below the origin of the curve */
 	if (y < rtsc->y)
 		return (rtsc->x);
 
 	/* past the first segment: x belongs to the 2nd segment */
 	if (y > rtsc->y + rtsc->dy)
 		return (rtsc->x + rtsc->dx +
 		    seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2));
 
 	/* x belongs to the 1st segment */
 	if (rtsc->dy == 0)
 		return (rtsc->x + rtsc->dx);
 	return (rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1));
 }
 
 /*
  * calculate the y value of the runtime service curve at the given
  * x value.
  */
 static u_int64_t
 rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x)
 {
 	/* at or before the origin of the curve */
 	if (x <= rtsc->x)
 		return (rtsc->y);
 
 	/* y belongs to the 1st segment */
 	if (x <= rtsc->x + rtsc->dx)
 		return (rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1));
 
 	/* y belongs to the 2nd segment */
 	return (rtsc->y + rtsc->dy +
 	    seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2));
 }
 
 /*
  * update the runtime service curve by taking the minimum of the current
  * runtime service curve and the service curve starting at (x, y).
  *
  * only the origin (x, y) and the first-segment extent (dx, dy) of rtsc
  * are modified; the slopes stored in rtsc are left untouched.
  */
 static void
 rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x,
     u_int64_t y)
 {
 	u_int64_t	y1, y2, dx, dy;
 
 	if (isc->sm1 <= isc->sm2) {
 		/* service curve is convex */
 		y1 = rtsc_x2y(rtsc, x);
 		if (y1 < y)
 			/* the current rtsc is smaller */
 			return;
 		/* otherwise restart the curve at (x, y) */
 		rtsc->x = x;
 		rtsc->y = y;
 		return;
 	}
 
 	/*
 	 * service curve is concave
 	 * compute the two y values of the current rtsc
 	 *	y1: at x
 	 *	y2: at (x + dx)
 	 */
 	y1 = rtsc_x2y(rtsc, x);
 	if (y1 <= y) {
 		/* rtsc is below isc, no change to rtsc */
 		return;
 	}
 
 	y2 = rtsc_x2y(rtsc, x + isc->dx);
 	if (y2 >= y + isc->dy) {
 		/* rtsc is above isc, replace rtsc by isc */
 		rtsc->x = x;
 		rtsc->y = y;
 		rtsc->dx = isc->dx;
 		rtsc->dy = isc->dy;
 		return;
 	}
 
 	/*
 	 * the two curves intersect
 	 * compute the offsets (dx, dy) using the reverse
 	 * function of seg_x2y()
 	 *	seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
 	 */
 	/* sm1 > sm2 on this path (concave), so the divisor is non-zero */
 	dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2);
 	/*
 	 * check if (x, y1) belongs to the 1st segment of rtsc.
 	 * if so, add the offset.
 	 */
 	if (rtsc->x + rtsc->dx > x)
 		dx += rtsc->x + rtsc->dx - x;
 	dy = seg_x2y(dx, isc->sm1);
 
 	rtsc->x = x;
 	rtsc->y = y;
 	rtsc->dx = dx;
 	rtsc->dy = dy;
 	return;
 }
 
 /*
  * Fill the version 0 statistics record sp from the state of class cl.
  * In this layout the m1/m2 slopes are clamped to UINT_MAX before
  * being stored (see SATU32 below).
  */
 static void
 get_class_stats_v0(struct hfsc_classstats_v0 *sp, struct hfsc_class *cl)
 {
 	sp->class_id = cl->cl_id;
 	sp->class_handle = cl->cl_handle;
 
 /* saturate a 64-bit value to UINT_MAX and narrow it to 32 bits */
 #define SATU32(x)	(u_int32_t)uqmin((x), UINT_MAX)
 
 	/* service curves, converted back to external units; 0 if absent */
 	if (cl->cl_rsc != NULL) {
 		sp->rsc.m1 = SATU32(sm2m(cl->cl_rsc->sm1));
 		sp->rsc.d = dx2d(cl->cl_rsc->dx);
 		sp->rsc.m2 = SATU32(sm2m(cl->cl_rsc->sm2));
 	} else {
 		sp->rsc.m1 = 0;
 		sp->rsc.d = 0;
 		sp->rsc.m2 = 0;
 	}
 	if (cl->cl_fsc != NULL) {
 		sp->fsc.m1 = SATU32(sm2m(cl->cl_fsc->sm1));
 		sp->fsc.d = dx2d(cl->cl_fsc->dx);
 		sp->fsc.m2 = SATU32(sm2m(cl->cl_fsc->sm2));
 	} else {
 		sp->fsc.m1 = 0;
 		sp->fsc.d = 0;
 		sp->fsc.m2 = 0;
 	}
 	if (cl->cl_usc != NULL) {
 		sp->usc.m1 = SATU32(sm2m(cl->cl_usc->sm1));
 		sp->usc.d = dx2d(cl->cl_usc->dx);
 		sp->usc.m2 = SATU32(sm2m(cl->cl_usc->sm2));
 	} else {
 		sp->usc.m1 = 0;
 		sp->usc.d = 0;
 		sp->usc.m2 = 0;
 	}
 
 #undef SATU32
 
 	/* scheduler state, copied verbatim */
 	sp->total = cl->cl_total;
 	sp->cumul = cl->cl_cumul;
 
 	sp->d = cl->cl_d;
 	sp->e = cl->cl_e;
 	sp->vt = cl->cl_vt;
 	sp->f = cl->cl_f;
 
 	sp->initvt = cl->cl_initvt;
 	sp->vtperiod = cl->cl_vtperiod;
 	sp->parentperiod = cl->cl_parentperiod;
 	sp->nactive = cl->cl_nactive;
 	sp->vtoff = cl->cl_vtoff;
 	sp->cvtmax = cl->cl_cvtmax;
 	sp->myf = cl->cl_myf;
 	sp->cfmin = cl->cl_cfmin;
 	sp->cvtmin = cl->cl_cvtmin;
 	sp->myfadj = cl->cl_myfadj;
 	sp->vtadj = cl->cl_vtadj;
 
 	/* clock reading and frequency at the time of the snapshot */
 	sp->cur_time = read_machclk();
 	sp->machclk_freq = machclk_freq;
 
 	/* queue occupancy and packet counters */
 	sp->qlength = qlen(cl->cl_q);
 	sp->qlimit = qlimit(cl->cl_q);
 	sp->xmit_cnt = cl->cl_stats.xmit_cnt;
 	sp->drop_cnt = cl->cl_stats.drop_cnt;
 	sp->period = cl->cl_stats.period;
 
 	/* queue type plus the statistics of the matching queue manager */
 	sp->qtype = qtype(cl->cl_q);
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		red_getstats(cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		codel_getstats(cl->cl_codel, &sp->codel);
 #endif
 }
 
 /*
  * Fill the version 1 statistics record sp from the state of class cl.
  * Unlike get_class_stats_v0(), the m1/m2 slopes are stored without
  * being clamped to UINT_MAX.
  */
 static void
 get_class_stats_v1(struct hfsc_classstats_v1 *sp, struct hfsc_class *cl)
 {
 	sp->class_id = cl->cl_id;
 	sp->class_handle = cl->cl_handle;
 
 	/* service curves, converted back to external units; 0 if absent */
 	if (cl->cl_rsc != NULL) {
 		sp->rsc.m1 = sm2m(cl->cl_rsc->sm1);
 		sp->rsc.d = dx2d(cl->cl_rsc->dx);
 		sp->rsc.m2 = sm2m(cl->cl_rsc->sm2);
 	} else {
 		sp->rsc.m1 = 0;
 		sp->rsc.d = 0;
 		sp->rsc.m2 = 0;
 	}
 	if (cl->cl_fsc != NULL) {
 		sp->fsc.m1 = sm2m(cl->cl_fsc->sm1);
 		sp->fsc.d = dx2d(cl->cl_fsc->dx);
 		sp->fsc.m2 = sm2m(cl->cl_fsc->sm2);
 	} else {
 		sp->fsc.m1 = 0;
 		sp->fsc.d = 0;
 		sp->fsc.m2 = 0;
 	}
 	if (cl->cl_usc != NULL) {
 		sp->usc.m1 = sm2m(cl->cl_usc->sm1);
 		sp->usc.d = dx2d(cl->cl_usc->dx);
 		sp->usc.m2 = sm2m(cl->cl_usc->sm2);
 	} else {
 		sp->usc.m1 = 0;
 		sp->usc.d = 0;
 		sp->usc.m2 = 0;
 	}
 
 	/* scheduler state, copied verbatim */
 	sp->total = cl->cl_total;
 	sp->cumul = cl->cl_cumul;
 
 	sp->d = cl->cl_d;
 	sp->e = cl->cl_e;
 	sp->vt = cl->cl_vt;
 	sp->f = cl->cl_f;
 
 	sp->initvt = cl->cl_initvt;
 	sp->vtperiod = cl->cl_vtperiod;
 	sp->parentperiod = cl->cl_parentperiod;
 	sp->nactive = cl->cl_nactive;
 	sp->vtoff = cl->cl_vtoff;
 	sp->cvtmax = cl->cl_cvtmax;
 	sp->myf = cl->cl_myf;
 	sp->cfmin = cl->cl_cfmin;
 	sp->cvtmin = cl->cl_cvtmin;
 	sp->myfadj = cl->cl_myfadj;
 	sp->vtadj = cl->cl_vtadj;
 
 	/* clock reading and frequency at the time of the snapshot */
 	sp->cur_time = read_machclk();
 	sp->machclk_freq = machclk_freq;
 
 	/* queue occupancy and packet counters */
 	sp->qlength = qlen(cl->cl_q);
 	sp->qlimit = qlimit(cl->cl_q);
 	sp->xmit_cnt = cl->cl_stats.xmit_cnt;
 	sp->drop_cnt = cl->cl_stats.drop_cnt;
 	sp->period = cl->cl_stats.period;
 
 	/* queue type plus the statistics of the matching queue manager */
 	sp->qtype = qtype(cl->cl_q);
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		red_getstats(cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		codel_getstats(cl->cl_codel, &sp->codel);
 #endif
 }
 
 /*
  * map a class handle to its class pointer.  returns NULL for the
  * handle 0 and for handles that match no class in the table.
  */
 static struct hfsc_class *
 clh_to_clp(struct hfsc_if *hif, u_int32_t chandle)
 {
 	struct hfsc_class *cl;
 	int slot;
 
 	if (chandle == 0)
 		return (NULL);
 
 	/*
 	 * optimistic probe: the class usually sits in the slot that
 	 * matches the lower bits of its handle.
 	 */
 	cl = hif->hif_class_tbl[chandle % HFSC_MAX_CLASSES];
 	if (cl != NULL && cl->cl_handle == chandle)
 		return (cl);
 
 	/* fall back to a linear search of the whole table */
 	for (slot = 0; slot < HFSC_MAX_CLASSES; slot++) {
 		cl = hif->hif_class_tbl[slot];
 		if (cl != NULL && cl->cl_handle == chandle)
 			return (cl);
 	}
 
 	return (NULL);
 }
 
 #endif /* ALTQ_HFSC */
diff --git a/sys/net/altq/altq_priq.c b/sys/net/altq/altq_priq.c
index f0627e2611a7..bb679baa236d 100644
--- a/sys/net/altq/altq_priq.c
+++ b/sys/net/altq/altq_priq.c
@@ -1,641 +1,642 @@
 /*-
  * Copyright (C) 2000-2003
  *	Sony Computer Science Laboratories Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $KAME: altq_priq.c,v 1.11 2003/09/17 14:23:25 kjc Exp $
  * $FreeBSD$
  */
 /*
  * priority queue
  */
 
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef ALTQ_PRIQ  /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <netpfil/pf/pf_mtag.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_priq.h>
 
 /*
  * function prototypes
  */
 static int priq_clear_interface(struct priq_if *);
 static int priq_request(struct ifaltq *, int, void *);
 static void priq_purge(struct priq_if *);
 static struct priq_class *priq_class_create(struct priq_if *, int, int, int,
     int);
 static int priq_class_destroy(struct priq_class *);
 static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
 static struct mbuf *priq_dequeue(struct ifaltq *, int);
 
 static int priq_addq(struct priq_class *, struct mbuf *);
 static struct mbuf *priq_getq(struct priq_class *);
 static struct mbuf *priq_pollq(struct priq_class *);
 static void priq_purgeq(struct priq_class *);
 
 static void get_class_stats(struct priq_classstats *, struct priq_class *);
 static struct priq_class *clh_to_clp(struct priq_if *, u_int32_t);
 
 /*
  * Attach the PRIQ discipline state held in a->altq_disc to the send queue
  * of the interface named a->ifname.
  *
  * Returns EINVAL when the interface cannot be found or no discipline state
  * is present, otherwise whatever altq_attach() returns.
  */
 int
 priq_pfattach(struct pf_altq *a)
 {
 	struct ifnet *ifp;
 	int error, s;
 
 	ifp = ifunit(a->ifname);
 	if (ifp == NULL)
 		return (EINVAL);
 	if (a->altq_disc == NULL)
 		return (EINVAL);
 
 	s = splnet();
 	error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc,
 	    priq_enqueue, priq_dequeue, priq_request);
 	splx(s);
 
 	return (error);
 }
 
 /*
  * Allocate the per-interface PRIQ state for 'ifp' and store it in
  * a->altq_disc.
  *
  * Returns EINVAL for a NULL interface, ENODEV when the interface send queue
  * is not ALTQ-ready, ENOMEM when the allocation fails and 0 on success.
  */
 int
 priq_add_altq(struct ifnet * ifp, struct pf_altq *a)
 {
 	struct priq_if *state;
 
 	if (ifp == NULL)
 		return (EINVAL);
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return (ENODEV);
 
 	state = malloc(sizeof(struct priq_if), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (state == NULL)
 		return (ENOMEM);
 
 	state->pif_ifq = &ifp->if_snd;
 	state->pif_maxpri = -1;		/* no class registered yet */
 	state->pif_bandwidth = a->ifbandwidth;
 
 	/* the discipline state lives in pf_altq from now on */
 	a->altq_disc = state;
 	return (0);
 }
 
 /*
  * Release the PRIQ state referenced by a->altq_disc: destroy all of its
  * classes and free the state itself.  Returns EINVAL when no state is
  * attached, 0 otherwise.
  */
 int
 priq_remove_altq(struct pf_altq *a)
 {
 	struct priq_if *state = a->altq_disc;
 
 	if (state == NULL)
 		return (EINVAL);
 
 	/* detach from pf_altq before tearing the state down */
 	a->altq_disc = NULL;
 	(void)priq_clear_interface(state);
 	free(state, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Create the PRIQ class described by 'a' (priority, queue limit, flags and
  * queue id).
  *
  * Returns EINVAL when no discipline state is attached, the priority is out
  * of range or the queue id is 0; EBUSY when the priority slot or the queue
  * id is already in use; ENOMEM when the class cannot be created.
  */
 int
 priq_add_queue(struct pf_altq *a)
 {
 	struct priq_if *pif = a->altq_disc;
 
 	if (pif == NULL)
 		return (EINVAL);
 
 	/* parameter validation */
 	if (a->priority >= PRIQ_MAXPRI || a->qid == 0)
 		return (EINVAL);
 	if (pif->pif_classes[a->priority] != NULL ||
 	    clh_to_clp(pif, a->qid) != NULL)
 		return (EBUSY);
 
 	if (priq_class_create(pif, a->priority, a->qlimit,
 	    a->pq_u.priq_opts.flags, a->qid) == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Destroy the class whose handle is a->qid.  Returns EINVAL when no
  * discipline state is attached or the handle is unknown, otherwise the
  * result of priq_class_destroy().
  */
 int
 priq_remove_queue(struct pf_altq *a)
 {
 	struct priq_if *pif = a->altq_disc;
 	struct priq_class *victim;
 
 	if (pif == NULL)
 		return (EINVAL);
 
 	victim = clh_to_clp(pif, a->qid);
 	if (victim == NULL)
 		return (EINVAL);
 
 	return (priq_class_destroy(victim));
 }
 
 int
 priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	struct priq_if *pif;
 	struct priq_class *cl;
 	struct priq_classstats stats;
 	int error = 0;
 
 	if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL)
 		return (EBADF);
 
 	if ((cl = clh_to_clp(pif, a->qid)) == NULL)
 		return (EINVAL);
 
 	if (*nbytes < sizeof(stats))
 		return (EINVAL);
 
 	get_class_stats(&stats, cl);
 
 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
 		return (error);
 	*nbytes = sizeof(stats);
 	return (0);
 }
 
 /*
  * Return the interface to its initial state: discard the filters (when the
  * ALTQ3 classifier compatibility code is built in) and destroy every class
  * registered on 'pif'.  Always returns 0.
  */
 static int
 priq_clear_interface(struct priq_if *pif)
 {
 	int pri;
 
 #ifdef ALTQ3_CLFIER_COMPAT
 	/* drop the filters attached to this interface */
 	acc_discard_filters(&pif->pif_classifier, NULL, 1);
 #endif
 
 	/* destroy the classes, walking the priority table upwards */
 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
 		struct priq_class *cl = pif->pif_classes[pri];
 
 		if (cl == NULL)
 			continue;
 		priq_class_destroy(cl);
 	}
 
 	return (0);
 }
 
 /*
  * Request handler registered with altq_attach().  Only ALTRQ_PURGE is
  * acted upon; every other request is ignored.  Always returns 0.
  */
 static int
 priq_request(struct ifaltq *ifq, int req, void *arg)
 {
 	struct priq_if *pif;
 
 	pif = (struct priq_if *)ifq->altq_disc;
 	IFQ_LOCK_ASSERT(ifq);
 
 	if (req == ALTRQ_PURGE)
 		priq_purge(pif);
 
 	return (0);
 }
 
 /*
  * Drop every packet queued on any class of the interface and, when ALTQ is
  * enabled on the interface queue, reset its length to zero.
  */
 static void
 priq_purge(struct priq_if *pif)
 {
 	int pri;
 
 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
 		struct priq_class *cl = pif->pif_classes[pri];
 
 		if (cl == NULL || qempty(cl->cl_q))
 			continue;
 		priq_purgeq(cl);
 	}
 
 	if (ALTQ_IS_ENABLED(pif->pif_ifq))
 		pif->pif_ifq->ifq_len = 0;
 }
 
 /*
  * Create the class for priority 'pri' on 'pif', or re-initialise the class
  * already occupying that priority slot.
  *
  *	pif	per-interface PRIQ state
  *	pri	priority, used as index into pif->pif_classes[]
  *	qlimit	queue limit in packets; 0 selects the default of 50
  *	flags	PRCF_* flags (default class, RED/RIO/CoDel, ECN, ...)
  *	qid	class handle
  *
  * Returns the class, or NULL when a requested queueing discipline is not
  * compiled in or an allocation fails.
  *
  * The class is entered into the interface state (pif_classes[],
  * pif_default, pif_maxpri) only after every allocation has succeeded, so
  * that a failure never leaves the interface pointing at freed memory.
  */
 static struct priq_class *
 priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid)
 {
 	struct priq_class *cl;
 	int s;
 
 #ifndef ALTQ_RED
 	if (flags & PRCF_RED) {
 #ifdef ALTQ_DEBUG
 		printf("priq_class_create: RED not configured for PRIQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 #ifndef ALTQ_CODEL
 	if (flags & PRCF_CODEL) {
 #ifdef ALTQ_DEBUG
 		printf("priq_class_create: CODEL not configured for PRIQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 
 	if ((cl = pif->pif_classes[pri]) != NULL) {
 		/* modify the class instead of creating a new one */
 		s = splnet();
 		IFQ_LOCK(cl->cl_pif->pif_ifq);
 		if (!qempty(cl->cl_q))
 			priq_purgeq(cl);
 		IFQ_UNLOCK(cl->cl_pif->pif_ifq);
 		splx(s);
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->cl_q))
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->cl_q))
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->cl_q))
 			codel_destroy(cl->cl_codel);
 #endif
 		/* the old AQM state is gone; do not keep a stale pointer */
 		cl->cl_red = NULL;
 	} else {
 		cl = malloc(sizeof(struct priq_class), M_DEVBUF,
 		    M_NOWAIT | M_ZERO);
 		if (cl == NULL)
 			return (NULL);
 
 		cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF,
 		    M_NOWAIT | M_ZERO);
 		if (cl->cl_q == NULL)
 			goto err_ret;
 	}
 
 	if (qlimit == 0)
 		qlimit = 50;  /* use default */
 	qlimit(cl->cl_q) = qlimit;
 	qtype(cl->cl_q) = Q_DROPTAIL;
 	qlen(cl->cl_q) = 0;
 	qsize(cl->cl_q) = 0;
 	cl->cl_flags = flags;
 	cl->cl_pri = pri;
 	cl->cl_pif = pif;
 	cl->cl_handle = qid;
 
 #ifdef ALTQ_RED
 	if (flags & (PRCF_RED|PRCF_RIO)) {
 		int red_flags, red_pkttime;
 
 		red_flags = 0;
 		if (flags & PRCF_ECN)
 			red_flags |= REDF_ECN;
 #ifdef ALTQ_RIO
 		if (flags & PRCF_CLEARDSCP)
 			red_flags |= RIOF_CLEARDSCP;
 #endif
 		if (pif->pif_bandwidth < 8)
 			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
 		else
 			red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
 			  * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
 #ifdef ALTQ_RIO
 		if (flags & PRCF_RIO) {
 			cl->cl_red = (red_t *)rio_alloc(0, NULL,
 						red_flags, red_pkttime);
 			if (cl->cl_red == NULL)
 				goto err_ret;
 			qtype(cl->cl_q) = Q_RIO;
 		} else
 #endif
 		if (flags & PRCF_RED) {
 			cl->cl_red = red_alloc(0, 0,
 			    qlimit(cl->cl_q) * 10/100,
 			    qlimit(cl->cl_q) * 30/100,
 			    red_flags, red_pkttime);
 			if (cl->cl_red == NULL)
 				goto err_ret;
 			qtype(cl->cl_q) = Q_RED;
 		}
 	}
 #endif /* ALTQ_RED */
 #ifdef ALTQ_CODEL
 	if (flags & PRCF_CODEL) {
 		/* a failed CoDel allocation falls back to drop-tail */
 		cl->cl_codel = codel_alloc(5, 100, 0);
 		if (cl->cl_codel != NULL)
 			qtype(cl->cl_q) = Q_CODEL;
 	}
 #endif
 
 	/* everything is allocated: make the class visible */
 	pif->pif_classes[pri] = cl;
 	if (flags & PRCF_DEFAULTCLASS)
 		pif->pif_default = cl;
 	if (pri > pif->pif_maxpri)
 		pif->pif_maxpri = pri;
 
 	return (cl);
 
  err_ret:
 	/*
 	 * A class that was being modified in place is still referenced by
 	 * the interface state; unlink it before it is freed.
 	 */
 	if (pif->pif_classes[pri] == cl) {
 		s = splnet();
 		IFQ_LOCK(pif->pif_ifq);
 		pif->pif_classes[pri] = NULL;
 		if (pif->pif_default == cl)
 			pif->pif_default = NULL;
 		while (pif->pif_maxpri >= 0 &&
 		    pif->pif_classes[pif->pif_maxpri] == NULL)
 			pif->pif_maxpri--;
 		IFQ_UNLOCK(pif->pif_ifq);
 		splx(s);
 	}
 	if (cl->cl_red != NULL) {
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->cl_q))
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->cl_q))
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->cl_q))
 			codel_destroy(cl->cl_codel);
 #endif
 	}
 	if (cl->cl_q != NULL)
 		free(cl->cl_q, M_DEVBUF);
 	free(cl, M_DEVBUF);
 	return (NULL);
 }
 
 /*
  * Destroy class 'cl': purge its queue, unlink it from the interface state
  * (priority table, highest-priority marker and default-class pointer),
  * release its AQM state and free the class.  Always returns 0.
  */
 static int
 priq_class_destroy(struct priq_class *cl)
 {
 	struct priq_if *pif;
 	int s, pri;
 
 	s = splnet();
 	IFQ_LOCK(cl->cl_pif->pif_ifq);
 
 #ifdef ALTQ3_CLFIER_COMPAT
 	/* delete filters referencing to this class */
 	acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0);
 #endif
 
 	if (!qempty(cl->cl_q))
 		priq_purgeq(cl);
 
 	pif = cl->cl_pif;
 	pif->pif_classes[cl->cl_pri] = NULL;
 	/*
 	 * priq_enqueue() falls back to pif_default; make sure it can never
 	 * be handed a class that is about to be freed.
 	 */
 	if (pif->pif_default == cl)
 		pif->pif_default = NULL;
 	if (pif->pif_maxpri == cl->cl_pri) {
 		/* the top class went away: find the next populated slot */
 		for (pri = cl->cl_pri; pri >= 0; pri--)
 			if (pif->pif_classes[pri] != NULL) {
 				pif->pif_maxpri = pri;
 				break;
 			}
 		if (pri < 0)
 			pif->pif_maxpri = -1;
 	}
 	IFQ_UNLOCK(cl->cl_pif->pif_ifq);
 	splx(s);
 
 	/* release the AQM state matching the queue type */
 	if (cl->cl_red != NULL) {
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->cl_q))
 			rio_destroy((rio_t *)cl->cl_red);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->cl_q))
 			red_destroy(cl->cl_red);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->cl_q))
 			codel_destroy(cl->cl_codel);
 #endif
 	}
 	free(cl->cl_q, M_DEVBUF);
 	free(cl, M_DEVBUF);
 	return (0);
 }
 
 /*
  * Enqueue routine registered as (*altq_enqueue) in struct ifaltq.
  *
  * The packet is queued on the class named by its pf tag, or on the default
  * class when the tag is missing or names no class.  Returns 0 when the
  * packet was queued and ENOBUFS when it was dropped (no packet header, no
  * usable class, or the class queue refused it).
  */
 static int
 priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 {
 	struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
 	struct priq_class *target = NULL;
 	struct pf_mtag *tag;
 	int pktlen;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* should not happen */
 		printf("altq: packet for %s does not have pkthdr\n",
 		    ifq->altq_ifp->if_xname);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* pick the class chosen by the classifier, else the default */
 	tag = pf_find_mtag(m);
 	if (tag != NULL)
 		target = clh_to_clp(pif, tag->qid);
 	if (target == NULL)
 		target = pif->pif_default;
 	if (target == NULL) {
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	target->cl_pktattr = NULL;
 	/* remember the length: the mbuf is gone if priq_addq() drops it */
 	pktlen = m_pktlen(m);
 	if (priq_addq(target, m) != 0) {
 		PKTCNTR_ADD(&target->cl_dropcnt, pktlen);
 		return (ENOBUFS);
 	}
 	IFQ_INC_LEN(ifq);
 
 	return (0);
 }
 
 /*
  * Dequeue routine registered as (*altq_dequeue) in struct ifaltq.
  *
  * The classes are scanned from the highest priority downwards and the
  * first one with a non-empty queue is served.  With ALTDQ_POLL the head
  * packet is returned without being removed; otherwise (ALTDQ_REMOVE) the
  * packet is taken off the queue.  ALTDQ_REMOVE must hand back the same
  * packet when called right after ALTDQ_POLL.
  */
 static struct mbuf *
 priq_dequeue(struct ifaltq *ifq, int op)
 {
 	struct priq_if *pif = (struct priq_if *)ifq->altq_disc;
 	struct priq_class *cl;
 	struct mbuf *m;
 	int pri;
 
 	IFQ_LOCK_ASSERT(ifq);
 
 	/* nothing queued on the interface at all */
 	if (IFQ_IS_EMPTY(ifq))
 		return (NULL);
 
 	for (pri = pif->pif_maxpri; pri >= 0; pri--) {
 		cl = pif->pif_classes[pri];
 		if (cl == NULL || qempty(cl->cl_q))
 			continue;
 
 		if (op == ALTDQ_POLL)
 			return (priq_pollq(cl));
 
 		m = priq_getq(cl);
 		if (m == NULL)
 			return (NULL);
 
 		IFQ_DEC_LEN(ifq);
 		if (qempty(cl->cl_q))
 			cl->cl_period++;
 		PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m));
 		return (m);
 	}
 	return (NULL);
 }
 
 /*
  * Append 'm' to the queue of class 'cl', delegating to RIO, RED or CoDel
  * when the queue is of that type.  For a plain queue the packet is freed
  * and -1 returned once the queue limit is reached; otherwise the DS field
  * is cleared if PRCF_CLEARDSCP is set, the packet is queued and 0 returned.
  */
 static int
 priq_addq(struct priq_class *cl, struct mbuf *m)
 {
 
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		return (rio_addq((rio_t *)cl->cl_red, cl->cl_q, m,
 		    cl->cl_pktattr));
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		return (red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr));
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		return (codel_addq(cl->cl_codel, cl->cl_q, m));
 #endif
 	if (qlen(cl->cl_q) < qlimit(cl->cl_q)) {
 		if (cl->cl_flags & PRCF_CLEARDSCP)
 			write_dsfield(m, cl->cl_pktattr, 0);
 		_addq(cl->cl_q, m);
 		return (0);
 	}
 
 	/* queue is full: tail drop */
 	m_freem(m);
 	return (-1);
 }
 
 static struct mbuf *
 priq_getq(struct priq_class *cl)
 {
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		return rio_getq((rio_t *)cl->cl_red, cl->cl_q);
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		return red_getq(cl->cl_red, cl->cl_q);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		return codel_getq(cl->cl_codel, cl->cl_q);
 #endif
 	return _getq(cl->cl_q);
 }
 
 /*
  * Return the packet at the head of the queue of class 'cl' without
  * removing it.
  */
 static struct mbuf *
 priq_pollq(struct priq_class *cl)
 {
 	return qhead(cl->cl_q);
 }
 
 /*
  * Free every packet queued on class 'cl', accounting each one as a drop.
  */
 static void
 priq_purgeq(struct priq_class *cl)
 {
 	struct mbuf *pkt;
 
 	if (qempty(cl->cl_q))
 		return;
 
 	for (pkt = _getq(cl->cl_q); pkt != NULL; pkt = _getq(cl->cl_q)) {
 		PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(pkt));
 		m_freem(pkt);
 	}
 	ASSERT(qlen(cl->cl_q) == 0);
 }
 
 static void
 get_class_stats(struct priq_classstats *sp, struct priq_class *cl)
 {
 	sp->class_handle = cl->cl_handle;
 	sp->qlength = qlen(cl->cl_q);
 	sp->qlimit = qlimit(cl->cl_q);
 	sp->period = cl->cl_period;
 	sp->xmitcnt = cl->cl_xmitcnt;
 	sp->dropcnt = cl->cl_dropcnt;
 
 	sp->qtype = qtype(cl->cl_q);
 #ifdef ALTQ_RED
 	if (q_is_red(cl->cl_q))
 		red_getstats(cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->cl_q))
 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->cl_q))
 		codel_getstats(cl->cl_codel, &sp->codel);
 #endif
 }
 
 /*
  * Map a class handle onto its class pointer by scanning the priority table
  * from the highest used priority downwards.  Handle 0 never matches; NULL
  * is returned when no class carries the handle.
  */
 static struct priq_class *
 clh_to_clp(struct priq_if *pif, u_int32_t chandle)
 {
 	struct priq_class *cl;
 	int pri;
 
 	if (chandle == 0)
 		return (NULL);
 
 	pri = pif->pif_maxpri;
 	while (pri >= 0) {
 		cl = pif->pif_classes[pri];
 		if (cl != NULL && cl->cl_handle == chandle)
 			return (cl);
 		pri--;
 	}
 
 	return (NULL);
 }
 
 #endif /* ALTQ_PRIQ */
diff --git a/sys/net/altq/altq_rmclass.c b/sys/net/altq/altq_rmclass.c
index a9af314cd48a..6e64a0a6f80e 100644
--- a/sys/net/altq/altq_rmclass.c
+++ b/sys/net/altq/altq_rmclass.c
@@ -1,1834 +1,1835 @@
 /*-
  * Copyright (c) 1991-1997 Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by the Network Research
  *      Group at Lawrence Berkeley Laboratory.
  * 4. Neither the name of the University nor of the Laboratory may be used
  *    to endorse or promote products derived from this software without
  *    specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * LBL code modified by speer@eng.sun.com, May 1977.
  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
  *
  * @(#)rm_class.c  1.48     97/12/05 SMI
  * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
  * $FreeBSD$
  */
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 
 #include <net/altq/if_altq.h>
 #include <net/altq/altq.h>
 #include <net/altq/altq_codel.h>
 #include <net/altq/altq_rmclass.h>
 #include <net/altq/altq_rmclass_debug.h>
 #include <net/altq/altq_red.h>
 #include <net/altq/altq_rio.h>
 
 /*
  * Local Macros
  */
 /*
  * Reset the cutoff level of interface data 'ifd' to RM_MAXDEPTH.  The
  * argument is parenthesized so that any pointer expression may be passed.
  */
 #define	reset_cutoff(ifd)	{ (ifd)->cutoff_ = RM_MAXDEPTH; }
 
 /*
  * Local routines.
  */
 
 static int	rmc_satisfied(struct rm_class *, struct timeval *);
 static void	rmc_wrr_set_weights(struct rm_ifdat *);
 static void	rmc_depth_compute(struct rm_class *);
 static void	rmc_depth_recompute(rm_class_t *);
 
 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
 
 static int	_rmc_addq(rm_class_t *, mbuf_t *);
 static void	_rmc_dropq(rm_class_t *);
 static mbuf_t	*_rmc_getq(rm_class_t *);
 static mbuf_t	*_rmc_pollq(rm_class_t *);
 
 static int	rmc_under_limit(struct rm_class *, struct timeval *);
 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
 static void	rmc_drop_action(struct rm_class *);
 static void	rmc_restart(void *);
 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
 
 #define	BORROW_OFFTIME
 /*
  * BORROW_OFFTIME (experimental):
  * borrow the offtime of the class borrowing from.
  * the reason is that when its own offtime is set, the class is unable
  * to borrow much, especially when cutoff is taking effect.
  * but when the borrowed class is overloaded (advidle is close to minidle),
  * use the borrowing class's offtime to avoid overload.
  */
 #define	ADJUST_CUTOFF
 /*
  * ADJUST_CUTOFF (experimental):
  * if no underlimit class is found due to cutoff, increase cutoff and
  * retry the scheduling loop.
  * also, don't invoke delay_actions while cutoff is taking effect,
  * since a sleeping class won't have a chance to be scheduled in the
  * next loop.
  *
  * now heuristics for setting the top-level variable (cutoff_) becomes:
  *	1. if a packet arrives for a not-overlimit class, set cutoff
  *	   to the depth of the class.
  *	2. if cutoff is i, and a packet arrives for an overlimit class
  *	   with an underlimit ancestor at a lower level than i (say j),
  *	   then set cutoff to j.
  *	3. at scheduling a packet, if there is no underlimit class
  *	   due to the current cutoff level, increase cutoff by 1 and
  *	   then try to schedule again.
  */
 
 /*
  * rm_class_t *
  * rmc_newclass(...) - Create a new resource management class at priority
  * 'pri' on the interface given by 'ifd'.
  *
  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
  *              than 100% of the bandwidth, this number should be the
  *              'effective' rate for the class.  Let f be the
  *              bandwidth fraction allocated to this class, and let
  *              nsPerByte be the data rate of the output link in
  *              nanoseconds/byte.  Then nsecPerByte is set to
  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
  *              for a class that gets 50% of an ethernet's bandwidth.
  *
  * action       the routine to call when the class is over limit.
  *
  * maxq         max allowable queue size for class (in packets).
  *
  * parent       parent class pointer.
  *
  * borrow       class to borrow from (should be either 'parent' or null).
  *
  * maxidle      max value allowed for class 'idle' time estimate (this
  *              parameter determines how large an initial burst of packets
  *              can be before overlimit action is invoked).
  *
  * offtime      how long 'delay' action will delay when class goes over
  *              limit (this parameter determines the steady-state burst
  *              size when a class is running over its limit).
  *
  * Maxidle and offtime have to be computed from the following:  If the
  * average packet size is s, the bandwidth fraction allocated to this
  * class is f, we want to allow b packet bursts, and the gain of the
  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
  *
  *   ptime = s * nsPerByte * (1 - f) / f
  *   maxidle = ptime * (1 - g^b) / g^b
  *   minidle = -ptime * (1 / (f - 1))
  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
  *
  * Operationally, it's convenient to specify maxidle & offtime in units
  * independent of the link bandwidth so the maxidle & offtime passed to
  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
  * (The constant factor is a scale factor needed to make the parameters
  * integers.  This scaling also means that the 'unscaled' values of
  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
  * maxidle also must be scaled upward by this value.  Thus, the passed
  * values for maxidle and offtime can be computed as follows:
  *
  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
  * offtime = offtime * 8 / (1000 * nsecPerByte)
  *
  * When USE_HRTIME is employed, then maxidle and offtime become:
  * 	maxidle = maxidle * (8.0 / nsecPerByte);
  * 	offtime = offtime * (8.0 / nsecPerByte);
  */
 /*
  * Returns the new class, or NULL when 'pri' is outside [0, RM_MAXPRIO),
  * 'nsecPerByte' is 0, a requested queueing discipline (RED, RIO, CoDel) is
  * not compiled in, or memory cannot be allocated.  A failed RED/RIO/CoDel
  * state allocation is not an error: the class keeps its drop-head queue.
  */
 struct rm_class *
 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
     void (*action)(rm_class_t *, rm_class_t *), int maxq,
     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
     int minidle, u_int offtime, int pktsize, int flags)
 {
 	struct rm_class	*cl;
 	struct rm_class	*peer;
 	int		 s;
 
 	/* 'pri' indexes the per-priority arrays of 'ifd' */
 	if (pri < 0 || pri >= RM_MAXPRIO)
 		return (NULL);
 	/* 'nsecPerByte' is used as a divisor below */
 	if (nsecPerByte == 0)
 		return (NULL);
 #ifndef ALTQ_RED
 	if (flags & RMCF_RED) {
 #ifdef ALTQ_DEBUG
 		printf("rmc_newclass: RED not configured for CBQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 #ifndef ALTQ_RIO
 	if (flags & RMCF_RIO) {
 #ifdef ALTQ_DEBUG
 		printf("rmc_newclass: RIO not configured for CBQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 #ifndef ALTQ_CODEL
 	if (flags & RMCF_CODEL) {
 #ifdef ALTQ_DEBUG
 		printf("rmc_newclass: CODEL not configured for CBQ!\n");
 #endif
 		return (NULL);
 	}
 #endif
 
 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cl == NULL)
 		return (NULL);
 	CALLOUT_INIT(&cl->callout_);
 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (cl->q_ == NULL) {
 		free(cl, M_DEVBUF);
 		return (NULL);
 	}
 
 	/*
 	 * Class initialization.
 	 */
 	cl->children_ = NULL;
 	cl->parent_ = parent;
 	cl->borrow_ = borrow;
 	cl->leaf_ = 1;
 	cl->ifdat_ = ifd;
 	cl->pri_ = pri;
 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
 	cl->depth_ = 0;
 	cl->qthresh_ = 0;
 	cl->ns_per_byte_ = nsecPerByte;
 
 	qlimit(cl->q_) = maxq;
 	qtype(cl->q_) = Q_DROPHEAD;
 	qlen(cl->q_) = 0;
 	cl->flags_ = flags;
 
 #if 1 /* minidle is also scaled in ALTQ */
 	/* signed arithmetic: minidle may be negative */
 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
 	if (cl->minidle_ > 0)
 		cl->minidle_ = 0;
 #else
 	cl->minidle_ = minidle;
 #endif
 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
 	if (cl->maxidle_ == 0)
 		cl->maxidle_ = 1;
 #if 1 /* offtime is also scaled in ALTQ */
 	cl->avgidle_ = cl->maxidle_;
 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
 	if (cl->offtime_ == 0)
 		cl->offtime_ = 1;
 #else
 	cl->avgidle_ = 0;
 	cl->offtime_ = (offtime * nsecPerByte) / 8;
 #endif
 	cl->overlimit = action;
 
 #ifdef ALTQ_RED
 	if (flags & (RMCF_RED|RMCF_RIO)) {
 		int red_flags, red_pkttime;
 
 		red_flags = 0;
 		if (flags & RMCF_ECN)
 			red_flags |= REDF_ECN;
 		if (flags & RMCF_FLOWVALVE)
 			red_flags |= REDF_FLOWVALVE;
 #ifdef ALTQ_RIO
 		if (flags & RMCF_CLEARDSCP)
 			red_flags |= RIOF_CLEARDSCP;
 #endif
 		red_pkttime = nsecPerByte * pktsize  / 1000;
 
 		if (flags & RMCF_RED) {
 			cl->red_ = red_alloc(0, 0,
 			    qlimit(cl->q_) * 10/100,
 			    qlimit(cl->q_) * 30/100,
 			    red_flags, red_pkttime);
 			if (cl->red_ != NULL)
 				qtype(cl->q_) = Q_RED;
 		}
 #ifdef ALTQ_RIO
 		else {
 			cl->red_ = (red_t *)rio_alloc(0, NULL,
 						      red_flags, red_pkttime);
 			if (cl->red_ != NULL)
 				qtype(cl->q_) = Q_RIO;
 		}
 #endif
 	}
 #endif /* ALTQ_RED */
 #ifdef ALTQ_CODEL
 	if (flags & RMCF_CODEL) {
 		cl->codel_ = codel_alloc(5, 100, 0);
 		if (cl->codel_ != NULL)
 			qtype(cl->q_) = Q_CODEL;
 	}
 #endif
 
 	/*
 	 * put the class into the class tree
 	 */
 	s = splnet();
 	IFQ_LOCK(ifd->ifq_);
 	if ((peer = ifd->active_[pri]) != NULL) {
 		/* find the last class at this pri */
 		cl->peer_ = peer;
 		while (peer->peer_ != ifd->active_[pri])
 			peer = peer->peer_;
 		peer->peer_ = cl;
 	} else {
 		/* first class at this priority: a ring of one */
 		ifd->active_[pri] = cl;
 		cl->peer_ = cl;
 	}
 
 	if (cl->parent_) {
 		/* push onto the head of the parent's children list */
 		cl->next_ = parent->children_;
 		parent->children_ = cl;
 		parent->leaf_ = 0;
 	}
 
 	/*
 	 * Compute the depth of this class and its ancestors in the class
 	 * hierarchy.
 	 */
 	rmc_depth_compute(cl);
 
 	/*
 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
 	 */
 	if (ifd->wrr_) {
 		ifd->num_[pri]++;
 		ifd->alloc_[pri] += cl->allotment_;
 		rmc_wrr_set_weights(ifd);
 	}
 	IFQ_UNLOCK(ifd->ifq_);
 	splx(s);
 	return (cl);
 }
 
 /*
  * Change the parameters of the existing class 'cl'.  The arguments have
  * the same meaning and scaling as the equally named arguments of
  * rmc_newclass(); 'pktsize' is not used.
  *
  * Returns EINVAL when 'nsecPerByte' is 0 (it is used as a divisor),
  * otherwise 0.
  */
 int
 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
     int minidle, u_int offtime, int pktsize)
 {
 	struct rm_ifdat	*ifd;
 	u_int		 old_allotment;
 	int		 s;
 
 	if (nsecPerByte == 0)
 		return (EINVAL);
 
 	ifd = cl->ifdat_;
 	old_allotment = cl->allotment_;
 
 	s = splnet();
 	IFQ_LOCK(ifd->ifq_);
 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
 	cl->qthresh_ = 0;
 	cl->ns_per_byte_ = nsecPerByte;
 
 	qlimit(cl->q_) = maxq;
 
 #if 1 /* minidle is also scaled in ALTQ */
 	/*
 	 * Scale in signed arithmetic, as rmc_newclass() does.  Without the
 	 * cast the product is computed unsigned, a negative minidle turns
 	 * into a large positive value and is then clamped to 0.
 	 */
 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
 	if (cl->minidle_ > 0)
 		cl->minidle_ = 0;
 #else
 	cl->minidle_ = minidle;
 #endif
 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
 	if (cl->maxidle_ == 0)
 		cl->maxidle_ = 1;
 #if 1 /* offtime is also scaled in ALTQ */
 	cl->avgidle_ = cl->maxidle_;
 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
 	if (cl->offtime_ == 0)
 		cl->offtime_ = 1;
 #else
 	cl->avgidle_ = 0;
 	cl->offtime_ = (offtime * nsecPerByte) / 8;
 #endif
 
 	/*
 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
 	 */
 	if (ifd->wrr_) {
 		/* replace the old allotment by the new one in the total */
 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
 		rmc_wrr_set_weights(ifd);
 	}
 	IFQ_UNLOCK(ifd->ifq_);
 	splx(s);
 	return (0);
 }
 
 /*
  * static void
  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - Recompute, for every
  *	priority level, the per-level divisor M_[] and the weighted
  *	allotment of each class used by the CBQ weighted round robin
  *	algorithm.
  *
  *	Returns: NONE
  */
 
 static void
 rmc_wrr_set_weights(struct rm_ifdat *ifd)
 {
 	struct rm_class	*first, *cur;
 	int		pri;
 
 	for (pri = 0; pri < RM_MAXPRIO; pri++) {
 		/*
 		 * This is inverted from that of the simulator to
 		 * maintain precision.
 		 */
 		if (ifd->num_[pri] != 0)
 			ifd->M_[pri] = ifd->alloc_[pri] /
 			    (ifd->num_[pri] * ifd->maxpkt_);
 		else
 			ifd->M_[pri] = 0;
 
 		/*
 		 * Precompute the weighted allotment of every class at this
 		 * level so that the WRR scheduling path needs no division.
 		 * The values only change when a class comes or goes.
 		 */
 		first = ifd->active_[pri];
 		if (first == NULL)
 			continue;
 
 		cur = first;
 		do {
 			/* safe-guard for slow link or alloc_ == 0 */
 			if (ifd->M_[pri] != 0)
 				cur->w_allotment_ = cur->allotment_ /
 				    ifd->M_[pri];
 			else
 				cur->w_allotment_ = 0;
 			cur = cur->peer_;
 		} while (cur != NULL && cur != first);
 	}
 }
 
 /*
  * Return the WRR divisor M_[pri] of 'ifd', or 0 when 'pri' is not a valid
  * priority.
  */
 int
 rmc_get_weight(struct rm_ifdat *ifd, int pri)
 {
 	if (pri < 0 || pri >= RM_MAXPRIO)
 		return (0);
 	return (ifd->M_[pri]);
 }
 
 /*
  * static void
  * rmc_depth_compute(struct rm_class *cl) - Walk from 'cl' towards the
  *	root, raising the depth of each ancestor that is not already
  *	deeper than its child.
  *
  *	Returns:	NONE
  */
 
 static void
 rmc_depth_compute(struct rm_class *cl)
 {
 	rm_class_t	*child, *parent;
 
 	for (child = cl; child != NULL; child = parent) {
 		parent = child->parent_;
 		/* stop at the root or once an ancestor is deep enough */
 		if (parent == NULL || child->depth_ < parent->depth_)
 			break;
 		parent->depth_ = child->depth_ + 1;
 	}
 }
 
 /*
  * static void
  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
  *	the depth of the tree after a class has been deleted.
  *
  *	Starting at 'cl' (which may be NULL) and moving towards the root,
  *	each class gets depth 0 when it has no children, otherwise one more
  *	than the deepest of its children.  The walk stops early as soon as
  *	a class with children already has the right depth.
  *
  *	Returns:	NONE
  */
 
 static void
 rmc_depth_recompute(rm_class_t *cl)
 {
 #if 1 /* ALTQ */
 	rm_class_t	*p, *t;
 
 	p = cl;
 	while (p != NULL) {
 		if ((t = p->children_) == NULL) {
 			/* no children left */
 			p->depth_ = 0;
 		} else {
 			int cdepth = 0;
 
 			/* find the depth of the deepest child */
 			while (t != NULL) {
 				if (t->depth_ > cdepth)
 					cdepth = t->depth_;
 				t = t->next_;
 			}
 
 			if (p->depth_ == cdepth + 1)
 				/* no change to this parent */
 				return;
 
 			p->depth_ = cdepth + 1;
 		}
 
 		/* the depth changed: the parent has to be revisited too */
 		p = p->parent_;
 	}
 #else
 	/* disabled recursive variant, kept for reference */
 	rm_class_t	*t;
 
 	if (cl->depth_ >= 1) {
 		if (cl->children_ == NULL) {
 			cl->depth_ = 0;
 		} else if ((t = cl->children_) != NULL) {
 			while (t != NULL) {
 				if (t->children_ != NULL)
 					rmc_depth_recompute(t);
 				t = t->next_;
 			}
 		} else
 			rmc_depth_compute(cl);
 	}
 #endif
 }
 
 /*
  * void
  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
  *	function deletes a class from the link-sharing structure and frees
  *	all resources associated with the class.
  *
  *	The class must not have any children (asserted).  Packets still
  *	queued on the class are dropped.
  *
  *	Returns: NONE
  */
 
 void
 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
 {
 	struct rm_class	*p, *head, *previous;
 	int		 s;
 
 	ASSERT(cl->children_ == NULL);
 
 	/* stop the class callout if the class is marked as sleeping */
 	if (cl->sleeping_)
 		CALLOUT_STOP(&cl->callout_);
 
 	s = splnet();
 	IFQ_LOCK(ifd->ifq_);
 	/*
 	 * Free packets in the packet queue.
 	 * XXX - this may not be a desired behavior.  Packets should be
 	 *		re-queued.
 	 */
 	rmc_dropall(cl);
 
 	/*
 	 * If the class has a parent, then remove the class from the
 	 * parent's children chain.
 	 */
 	if (cl->parent_ != NULL) {
 		head = cl->parent_->children_;
 		p = previous = head;
 		if (head->next_ == NULL) {
 			/* only child: the parent becomes a leaf again */
 			ASSERT(head == cl);
 			cl->parent_->children_ = NULL;
 			cl->parent_->leaf_ = 1;
 		} else while (p != NULL) {
 			if (p == cl) {
 				/* found: unlink from the singly linked list */
 				if (cl == head)
 					cl->parent_->children_ = cl->next_;
 				else
 					previous->next_ = cl->next_;
 				cl->next_ = NULL;
 				p = NULL;
 			} else {
 				previous = p;
 				p = p->next_;
 			}
 		}
 	}
 
 	/*
 	 * Delete class from class priority peer list.
 	 */
 	if ((p = ifd->active_[cl->pri_]) != NULL) {
 		/*
 		 * If there is more than one member of this priority
 		 * level, then look for class(cl) in the priority level.
 		 */
 		if (p != p->peer_) {
 			/* find the predecessor of cl in the peer ring */
 			while (p->peer_ != cl)
 				p = p->peer_;
 			p->peer_ = cl->peer_;
 
 			/* cl was the ring entry point: move it on */
 			if (ifd->active_[cl->pri_] == cl)
 				ifd->active_[cl->pri_] = cl->peer_;
 		} else {
 			ASSERT(p == cl);
 			ifd->active_[cl->pri_] = NULL;
 		}
 	}
 
 	/*
 	 * Recompute the WRR weights.
 	 */
 	if (ifd->wrr_) {
 		ifd->alloc_[cl->pri_] -= cl->allotment_;
 		ifd->num_[cl->pri_]--;
 		rmc_wrr_set_weights(ifd);
 	}
 
 	/*
 	 * Re-compute the depth of the tree.
 	 */
 #if 1 /* ALTQ */
 	rmc_depth_recompute(cl->parent_);
 #else
 	rmc_depth_recompute(ifd->root_);
 #endif
 
 	IFQ_UNLOCK(ifd->ifq_);
 	splx(s);
 
 	/*
 	 * Free the class structure.
 	 */
 	if (cl->red_ != NULL) {
 		/* release the AQM state matching the queue type */
 #ifdef ALTQ_RIO
 		if (q_is_rio(cl->q_))
 			rio_destroy((rio_t *)cl->red_);
 #endif
 #ifdef ALTQ_RED
 		if (q_is_red(cl->q_))
 			red_destroy(cl->red_);
 #endif
 #ifdef ALTQ_CODEL
 		if (q_is_codel(cl->q_))
 			codel_destroy(cl->codel_);
 #endif
 	}
 	free(cl->q_, M_DEVBUF);
 	free(cl, M_DEVBUF);
 }
 
 /*
  * void
  * rmc_init(...) - Initialize the resource management data structures
  *	for the output queue 'ifq'.  'ifd' is where the structures will be
  *	built; it is cleared first.  'nsecPerByte' gives the link speed
  *	(inverse of bandwidth) in nanoseconds/byte.  'restart' is the
  *	driver-specific routine that the generic 'delay until under limit'
  *	action will call to restart output.  `maxq' is the queue size
  *	handed to the root class.  'maxqueued' is the maximum number of
  *	packets that the resource management code will allow to be queued
  *	'downstream' (this is typically 1).  'maxidle', 'minidle' and
  *	'offtime' are passed through to the root class.  'flags' selects
  *	weighted round-robin (RMCF_WRR) and link efficiency
  *	(RMCF_EFFICIENT).
  *
  *	If the root class cannot be allocated, a message is printed and
  *	ifd->root_ is left NULL.
  *
  *	Returns:	NONE
  */
 
 void
 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
     int minidle, u_int offtime, int flags)
 {
 	int		prio, slot, mtu;
 #if 1
 	u_int		pkt_ns;
 #endif
 
 	/* Bring up the CBQ tracing/debug facility. */
 	CBQTRACEINIT();
 
 	/* Start from an all-zero interface state. */
 	bzero((char *)ifd, sizeof (*ifd));
 
 	mtu = ifq->altq_ifp->if_mtu;
 	ifd->ifq_ = ifq;
 	ifd->restart = restart;
 	ifd->maxqueued_ = maxqueued;
 	ifd->ns_per_byte_ = nsecPerByte;
 	ifd->maxpkt_ = mtu;
 	ifd->wrr_ = ((flags & RMCF_WRR) != 0);
 	ifd->efficient_ = ((flags & RMCF_EFFICIENT) != 0);
 #if 1
 	/*
 	 * maxiftime_ is 16 MTU-sized packet times in usec, cut to a
 	 * quarter of that when one such packet takes more than 10 ms.
 	 */
 	pkt_ns = mtu * nsecPerByte;
 	ifd->maxiftime_ = pkt_ns / 1000 * 16;
 	if (pkt_ns > 10 * 1000000)
 		ifd->maxiftime_ /= 4;
 #endif
 
 	reset_cutoff(ifd);
 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
 
 	/* Per-priority WRR state. */
 	for (prio = 0; prio < RM_MAXPRIO; prio++) {
 		ifd->alloc_[prio] = 0;
 		ifd->M_[prio] = 0;
 		ifd->num_[prio] = 0;
 		ifd->na_[prio] = 0;
 		ifd->active_[prio] = NULL;
 	}
 
 	/* Ring of packets currently handed to the driver. */
 	ifd->qi_ = 0;
 	ifd->qo_ = 0;
 	for (slot = 0; slot < RM_MAXQUEUED; slot++) {
 		ifd->class_[slot] = NULL;
 		ifd->curlen_[slot] = 0;
 		ifd->borrowed_[slot] = NULL;
 	}
 
 	/* Create the root class of the link-sharing structure. */
 	ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, rmc_root_overlimit,
 	    maxq, 0, 0, maxidle, minidle, offtime, 0, 0);
 	if (ifd->root_ == NULL) {
 		printf("rmc_init: root class not allocated\n");
 		return;
 	}
 	ifd->root_->depth_ = 0;
 }
 
 /*
  * void
  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
  *	by a driver's if_output routine.  This routine must be called with
  *	output packet completion interrupts locked out (to avoid racing with
  *	rmc_dequeue_next).
  *
  *	Returns:	0 on successful queueing
  *			-1 when packet drop occurs
  */
 int
 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
 {
 	struct timeval	 now;
 	struct rm_ifdat *ifd = cl->ifdat_;
 	int		 cpri = cl->pri_;
 	/* Sampled before enqueueing: decides whether na_[] is bumped below. */
 	int		 is_empty = qempty(cl->q_);
 
 	RM_GETTIME(now);
 	/* Cutoff adjustment is only attempted while cutoff_ is above 0. */
 	if (ifd->cutoff_ > 0) {
 		if (TV_LT(&cl->undertime_, &now)) {
 			/* The class is underlimit: cutoff may drop to its depth. */
 			if (ifd->cutoff_ > cl->depth_)
 				ifd->cutoff_ = cl->depth_;
 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
 		}
 #if 1 /* ALTQ */
 		else {
 			/*
 			 * the class is overlimit. if the class has
 			 * underlimit ancestors, set cutoff to the lowest
 			 * depth among them.
 			 */
 			struct rm_class *borrow = cl->borrow_;
 
 			/* Follow the borrow chain while it stays below cutoff. */
 			while (borrow != NULL &&
 			       borrow->depth_ < ifd->cutoff_) {
 				if (TV_LT(&borrow->undertime_, &now)) {
 					ifd->cutoff_ = borrow->depth_;
 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
 					break;
 				}
 				borrow = borrow->borrow_;
 			}
 		}
 #else /* !ALTQ */
 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
 				ifd->cutoff_ = cl->borrow_->depth_;
 				CBQTRACE(rmc_queue_packet, 'ffob',
 					 cl->borrow_->depth_);
 			}
 		}
 #endif /* !ALTQ */
 	}
 
 	if (_rmc_addq(cl, m) < 0)
 		/* failed */
 		return (-1);
 
 	/* The queue went from empty to non-empty: one more active class. */
 	if (is_empty) {
 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
 		ifd->na_[cpri]++;
 	}
 
 	/*
 	 * Over the queue limit: rmc_drop_action() removes one packet and
 	 * the caller is told that a drop occurred.
 	 */
 	if (qlen(cl->q_) > qlimit(cl->q_)) {
 		/* note: qlimit can be set to 0 or 1 */
 		rmc_drop_action(cl);
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * void
  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
  *	active classes, highest priority first, to see if they are
  *	satisfied.  The first unsatisfied class found sets the cutoff to
  *	its depth; if every class is satisfied the cutoff is reset.
  */
 
 static void
 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
 {
 	int		 pri;
 	rm_class_t	*head, *cur;
 
 	for (pri = RM_MAXPRIO - 1; pri >= 0; pri--) {
 		head = ifd->active_[pri];
 		if (head == NULL)
 			continue;
 
 		/* Go once around the circular peer list of this level. */
 		cur = head;
 		do {
 			if (rmc_satisfied(cur, now) == 0) {
 				ifd->cutoff_ = cur->depth_;
 				return;
 			}
 			cur = cur->peer_;
 		} while (cur != head);
 	}
 
 	reset_cutoff(ifd);
 }
 
 /*
  * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
  *
  *	A NULL class and a class whose undertime_ is still in the future
  *	count as satisfied.  A depth-0 class is unsatisfied only when it is
  *	not sleeping and its queue is longer than qthresh_.  Any other
  *	class is satisfied when all of its children are.
  */
 
 static int
 rmc_satisfied(struct rm_class *cl, struct timeval *now)
 {
 	rm_class_t	*child;
 
 	if (cl == NULL || TV_LT(now, &cl->undertime_))
 		return (1);
 
 	if (cl->depth_ == 0)
 		return ((!cl->sleeping_ &&
 		    (qlen(cl->q_) > cl->qthresh_)) ? 0 : 1);
 
 	for (child = cl->children_; child != NULL; child = child->next_) {
 		if (!rmc_satisfied(child, now))
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Return 1 if class 'cl' is under limit or can borrow from a parent,
  * 0 if overlimit.  As a side-effect, this routine will invoke the
  * class overlimit action if the class is overlimit.
  */
 
 static int
 rmc_under_limit(struct rm_class *cl, struct timeval *now)
 {
 	rm_class_t	*p = cl;	/* the class asked about; 'cl' walks the borrow chain */
 	rm_class_t	*top;
 	struct rm_ifdat	*ifd = cl->ifdat_;
 
 	/* No lender recorded for the current slot until one is found. */
 	ifd->borrowed_[ifd->qi_] = NULL;
 	/*
 	 * If cl is the root class, then always return that it is
 	 * underlimit.  Otherwise, check to see if the class is underlimit.
 	 */
 	if (cl->parent_ == NULL)
 		return (1);
 
 	if (cl->sleeping_) {
 		/* Still inside the delay period. */
 		if (TV_LT(now, &cl->undertime_))
 			return (0);
 
 		/* The delay has expired: wake the class without the timer. */
 		CALLOUT_STOP(&cl->callout_);
 		cl->sleeping_ = 0;
 		cl->undertime_.tv_sec = 0;
 		return (1);
 	}
 
 	top = NULL;
 	/*
 	 * Climb the borrow chain while the current class is overlimit
 	 * (undertime_ set and still in the future).
 	 */
 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
 		/* Out of lenders, or the next lender is above the cutoff. */
 		if (((cl = cl->borrow_) == NULL) ||
 		    (cl->depth_ > ifd->cutoff_)) {
 #ifdef ADJUST_CUTOFF
 			if (cl != NULL)
 				/* cutoff is taking effect, just
 				   return false without calling
 				   the delay action. */
 				return (0);
 #endif
 #ifdef BORROW_OFFTIME
 			/*
 			 * check if the class can borrow offtime too.
 			 * borrow offtime from the top of the borrow
 			 * chain if the top class is not overloaded.
 			 */
 			if (cl != NULL) {
 				/* cutoff is taking effect, use this class as top. */
 				top = cl;
 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
 			}
 			if (top != NULL && top->avgidle_ == top->minidle_)
 				top = NULL;
 			p->overtime_ = *now;
 			(p->overlimit)(p, top);
 #else
 			/* Record when the class went over and run its overlimit action. */
 			p->overtime_ = *now;
 			(p->overlimit)(p, NULL);
 #endif
 			return (0);
 		}
 		/* Remember the highest lender visited so far. */
 		top = cl;
 	}
 
 	/* An ancestor is lending: record it for the slot being filled. */
 	if (cl != p)
 		ifd->borrowed_[ifd->qi_] = cl;
 	return (1);
 }
 
 /*
  * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
  *	Packet-by-packet round robin.
  *
  * The heart of the weighted round-robin scheduler, which decides which
  * class next gets to send a packet.  Highest priority first, then
  * weighted round-robin within priorities.
  *
  * Each able-to-send class gets to send until its byte allocation is
  * exhausted.  Thus, the active pointer is only changed after a class has
  * exhausted its allocation.
  *
  * If the scheduler finds no class that is underlimit or able to borrow,
  * then the first class found that had a nonzero queue and is allowed to
  * borrow gets to send.
  */
 
 static mbuf_t *
 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
 {
 	struct rm_class	*cl = NULL, *first = NULL;
 	u_int		 deficit;
 	int		 cpri;
 	mbuf_t		*m;
 	struct timeval	 now;
 
 	RM_GETTIME(now);
 
 	/*
 	 * if the driver polls the top of the queue and then removes
 	 * the polled packet, we must return the same packet.
 	 */
 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
 		cl = ifd->pollcache_;
 		cpri = cl->pri_;
 		if (ifd->efficient_) {
 			/* check if this class is overlimit */
 			if (cl->undertime_.tv_sec != 0 &&
 			    rmc_under_limit(cl, &now) == 0)
 				first = cl;
 		}
 		ifd->pollcache_ = NULL;
 		goto _wrr_out;
 	}
 	else {
 		/* mode == ALTDQ_POLL || pollcache == NULL */
 		ifd->pollcache_ = NULL;
 		ifd->borrowed_[ifd->qi_] = NULL;
 	}
 #ifdef ADJUST_CUTOFF
  _again:
 #endif
 	/* Scan priority levels from highest to lowest. */
 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
 		/* Skip levels with no backlogged class. */
 		if (ifd->na_[cpri] == 0)
 			continue;
 		deficit = 0;
 		/*
 		 * Loop through twice for a priority level, if some class
 		 * was unable to send a packet the first round because
 		 * of the weighted round-robin mechanism.
 		 * During the second loop at this level, deficit==2.
 		 * (This second loop is not needed if for every class,
 		 * "M[cl->pri_])" times "cl->allotment" is greater than
 		 * the byte size for the largest packet in the class.)
 		 */
  _wrr_loop:
 		cl = ifd->active_[cpri];
 		ASSERT(cl != NULL);
 		do {
 			/* Refill an exhausted allocation (not on the second pass). */
 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
 				cl->bytes_alloc_ += cl->w_allotment_;
 			if (!qempty(cl->q_)) {
 				if ((cl->undertime_.tv_sec == 0) ||
 				    rmc_under_limit(cl, &now)) {
 					/* Send if it has allocation, or on the second pass. */
 					if (cl->bytes_alloc_ > 0 || deficit > 1)
 						goto _wrr_out;
 
 					/* underlimit but no alloc */
 					deficit = 1;
 #if 1
 					ifd->borrowed_[ifd->qi_] = NULL;
 #endif
 				}
 				else if (first == NULL && cl->borrow_ != NULL)
 					first = cl; /* borrowing candidate */
 			}
 
 			/* This class did not send: clear its allocation and move on. */
 			cl->bytes_alloc_ = 0;
 			cl = cl->peer_;
 		} while (cl != ifd->active_[cpri]);
 
 		if (deficit == 1) {
 			/* first loop found an underlimit class with deficit */
 			/* Loop on same priority level, with new deficit.  */
 			deficit = 2;
 			goto _wrr_loop;
 		}
 	}
 
 #ifdef ADJUST_CUTOFF
 	/*
 	 * no underlimit class found.  if cutoff is taking effect,
 	 * increase cutoff and try again.
 	 */
 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
 		ifd->cutoff_++;
 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
 		goto _again;
 	}
 #endif /* ADJUST_CUTOFF */
 	/*
 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
 	 * class we encounter will send a packet if all the classes
 	 * of the link-sharing structure are overlimit.
 	 */
 	reset_cutoff(ifd);
 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
 
 	if (!ifd->efficient_ || first == NULL)
 		return (NULL);
 
 	/* 'first' was only set for a class with a non-NULL borrow_. */
 	cl = first;
 	cpri = cl->pri_;
 #if 0	/* too time-consuming for nothing */
 	if (cl->sleeping_)
 		CALLOUT_STOP(&cl->callout_);
 	cl->sleeping_ = 0;
 	cl->undertime_.tv_sec = 0;
 #endif
 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
 	ifd->cutoff_ = cl->borrow_->depth_;
 
 	/*
 	 * Deque the packet and do the book keeping...
 	 */
  _wrr_out:
 	/* Every path reaching this label has set both 'cl' and 'cpri'. */
 	if (op == ALTDQ_REMOVE) {
 		m = _rmc_getq(cl);
 		if (m == NULL)
 			panic("_rmc_wrr_dequeue_next");
 		if (qempty(cl->q_))
 			ifd->na_[cpri]--;
 
 		/*
 		 * Update class statistics and link data.
 		 */
 		if (cl->bytes_alloc_ > 0)
 			cl->bytes_alloc_ -= m_pktlen(m);
 
 		/*
 		 * Advance the active pointer once the allocation is used up
 		 * or when the class was recorded in 'first'; otherwise the
 		 * same class stays active.
 		 */
 		if ((cl->bytes_alloc_ <= 0) || first == cl)
 			ifd->active_[cl->pri_] = cl->peer_;
 		else
 			ifd->active_[cl->pri_] = cl;
 
 		/* Record the packet in the next outstanding slot. */
 		ifd->class_[ifd->qi_] = cl;
 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
 		ifd->now_[ifd->qi_] = now;
 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
 		ifd->queued_++;
 	} else {
 		/* mode == ALTDQ_POLL */
 		m = _rmc_pollq(cl);
 		ifd->pollcache_ = cl;
 	}
 	return (m);
 }
 
 /*
  * Dequeue & return next packet from the highest priority class that
  * has a packet to send & has enough allocation to send it.  This
  * routine is called by a driver whenever it needs a new packet to
  * output.
  */
 static mbuf_t *
 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
 {
 	mbuf_t		*m;
 	int		 cpri;
 	struct rm_class	*cl, *first = NULL;
 	struct timeval	 now;
 
 	RM_GETTIME(now);
 
 	/*
 	 * if the driver polls the top of the queue and then removes
 	 * the polled packet, we must return the same packet.
 	 */
 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
 		cl = ifd->pollcache_;
 		cpri = cl->pri_;
 		ifd->pollcache_ = NULL;
 		goto _prr_out;
 	} else {
 		/* mode == ALTDQ_POLL || pollcache == NULL */
 		ifd->pollcache_ = NULL;
 		ifd->borrowed_[ifd->qi_] = NULL;
 	}
 #ifdef ADJUST_CUTOFF
  _again:
 #endif
 	/* Scan priority levels from highest to lowest. */
 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
 		/* Skip levels with no backlogged class. */
 		if (ifd->na_[cpri] == 0)
 			continue;
 		cl = ifd->active_[cpri];
 		ASSERT(cl != NULL);
 		do {
 			if (!qempty(cl->q_)) {
 				/* Underlimit, or able to borrow: this class sends. */
 				if ((cl->undertime_.tv_sec == 0) ||
 				    rmc_under_limit(cl, &now))
 					goto _prr_out;
 				/* Remember the first overlimit class that may borrow. */
 				if (first == NULL && cl->borrow_ != NULL)
 					first = cl;
 			}
 			cl = cl->peer_;
 		} while (cl != ifd->active_[cpri]);
 	}
 
 #ifdef ADJUST_CUTOFF
 	/*
 	 * no underlimit class found.  if cutoff is taking effect, increase
 	 * cutoff and try again.
 	 */
 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
 		ifd->cutoff_++;
 		goto _again;
 	}
 #endif /* ADJUST_CUTOFF */
 	/*
 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
 	 * class we encounter will send a packet if all the classes
 	 * of the link-sharing structure are overlimit.
 	 */
 	reset_cutoff(ifd);
 	if (!ifd->efficient_ || first == NULL)
 		return (NULL);
 
 	/* 'first' was only set for a class with a non-NULL borrow_. */
 	cl = first;
 	cpri = cl->pri_;
 #if 0	/* too time-consuming for nothing */
 	if (cl->sleeping_)
 		CALLOUT_STOP(&cl->callout_);
 	cl->sleeping_ = 0;
 	cl->undertime_.tv_sec = 0;
 #endif
 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
 	ifd->cutoff_ = cl->borrow_->depth_;
 
 	/*
 	 * Deque the packet and do the book keeping...
 	 */
  _prr_out:
 	/* Every path reaching this label has set both 'cl' and 'cpri'. */
 	if (op == ALTDQ_REMOVE) {
 		m = _rmc_getq(cl);
 		if (m == NULL)
 			panic("_rmc_prr_dequeue_next");
 		if (qempty(cl->q_))
 			ifd->na_[cpri]--;
 
 		/* Packet-by-packet round robin: always move to the next peer. */
 		ifd->active_[cpri] = cl->peer_;
 
 		/* Record the packet in the next outstanding slot. */
 		ifd->class_[ifd->qi_] = cl;
 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
 		ifd->now_[ifd->qi_] = now;
 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
 		ifd->queued_++;
 	} else {
 		/* mode == ALTDQ_POLL */
 		m = _rmc_pollq(cl);
 		ifd->pollcache_ = cl;
 	}
 	return (m);
 }
 
 /*
  * mbuf_t *
  * rmc_dequeue_next(struct rm_ifdat *ifd, int mode) - this function
  *	is invoked by the packet driver to get the next packet to be
  *	dequeued and output on the link.  If WRR is enabled, then the
  *	WRR dequeue next routine will determine the next packet to send.
  *	Otherwise, packet-by-packet round robin is invoked.  'mode' is
  *	handed to the selected routine unchanged.
  *
  *	Returns:	NULL, if a packet is not available, if all
  *			classes are overlimit, or if 'maxqueued_' packets
  *			are already outstanding.
  *
  *			Otherwise, Pointer to the next packet.
  */
 
 mbuf_t *
 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
 {
 	/* Every downstream slot is in use: hand out nothing. */
 	if (ifd->queued_ >= ifd->maxqueued_)
 		return (NULL);
 
 	return (ifd->wrr_ ? _rmc_wrr_dequeue_next(ifd, mode) :
 	    _rmc_prr_dequeue_next(ifd, mode));
 }
 
 /*
  * Update the utilization estimate for the packet that just completed.
  * The packet's class & the parent(s) of that class all get their
  * estimators updated.  This routine is called by the driver's output-
  * packet-completion interrupt service routine.
  */
 
 /*
  * a macro to approximate "divide by 1000" that gives 0.000999,
  * if a value has enough effective digits.
  * (on pentium, mul takes 9 cycles but div takes 46!)
  */
 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
 /*
  * Account for the packet in slot qo_ (the oldest outstanding one):
  * advance the interface clock ifnow_, run the idle-time estimator on
  * the sending class and each of its ancestors, adjust the cutoff if the
  * class borrowed, and release the slot.  Does nothing when the slot is
  * empty.
  */
 void
 rmc_update_class_util(struct rm_ifdat *ifd)
 {
 	int		 idle, avgidle, pktlen;
 	int		 pkt_time, tidle;
 	rm_class_t	*cl, *borrowed;
 	rm_class_t	*borrows;
 	struct timeval	*nowp;
 
 	/*
 	 * Get the most recent completed class.
 	 */
 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
 		return;
 
 	pktlen = ifd->curlen_[ifd->qo_];
 	borrowed = ifd->borrowed_[ifd->qo_];
 	/* 'borrows' is cleared during the climb once the lender is reached. */
 	borrows = borrowed;
 
 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
 
 	/*
 	 * Run estimator on class and its ancestors.
 	 */
 	/*
 	 * rm_update_class_util is designed to be called when the
 	 * transfer is completed from a xmit complete interrupt,
 	 * but most drivers don't implement an upcall for that.
 	 * so, just use estimated completion time.
 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
 	 */
 	nowp = &ifd->now_[ifd->qo_];
 	/* get pkt_time (for link) in usec */
 #if 1  /* use approximation */
 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
 	pkt_time = NSEC_TO_USEC(pkt_time);
 #else
 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
 #endif
 #if 1 /* ALTQ4PPP */
 	/* ifnow_ is already ahead of the dequeue time recorded for this slot. */
 	if (TV_LT(nowp, &ifd->ifnow_)) {
 		int iftime;
 
 		/*
 		 * make sure the estimated completion time does not go
 		 * too far.  it can happen when the link layer supports
 		 * data compression or the interface speed is set to
 		 * a much lower value.
 		 */
 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
 		if (iftime+pkt_time < ifd->maxiftime_) {
 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
 		} else {
 			/* Clamp: never run more than maxiftime_ ahead of nowp. */
 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
 		}
 	} else {
 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
 	}
 #else
 	if (TV_LT(nowp, &ifd->ifnow_)) {
 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
 	} else {
 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
 	}
 #endif
 
 	/* Climb from the sending class to the root, updating each estimator. */
 	while (cl != NULL) {
 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
 		if (idle >= 2000000)
 			/*
 			 * this class is idle enough, reset avgidle.
 			 * (TV_DELTA returns 2000000 us when delta is large.)
 			 */
 			cl->avgidle_ = cl->maxidle_;
 
 		/* get pkt_time (for class) in usec */
 #if 1  /* use approximation */
 		pkt_time = pktlen * cl->ns_per_byte_;
 		pkt_time = NSEC_TO_USEC(pkt_time);
 #else
 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
 #endif
 		idle -= pkt_time;
 
 		/* Fold the new idle sample into the running average. */
 		avgidle = cl->avgidle_;
 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
 		cl->avgidle_ = avgidle;
 
 		/* Are we overlimit ? */
 		if (avgidle <= 0) {
 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
 #if 1 /* ALTQ */
 			/*
 			 * need some lower bound for avgidle, otherwise
 			 * a borrowing class gets unbounded penalty.
 			 */
 			if (avgidle < cl->minidle_)
 				avgidle = cl->avgidle_ = cl->minidle_;
 #endif
 			/* set next idle to make avgidle 0 */
 			tidle = pkt_time +
 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
 			++cl->stats_.over;
 		} else {
 			/* Underlimit: cap avgidle, clear undertime_, wake a sleeper. */
 			cl->avgidle_ =
 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
 			cl->undertime_.tv_sec = 0;
 			if (cl->sleeping_) {
 				CALLOUT_STOP(&cl->callout_);
 				cl->sleeping_ = 0;
 			}
 		}
 
 		/* Count a borrow on every class below the lender. */
 		if (borrows != NULL) {
 			if (borrows != cl)
 				++cl->stats_.borrows;
 			else
 				borrows = NULL;
 		}
 		cl->last_ = ifd->ifnow_;
 		cl->last_pkttime_ = pkt_time;
 
 #if 1
 		if (cl->parent_ == NULL) {
 			/* take stats of root class */
 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
 		}
 #endif
 
 		cl = cl->parent_;
 	}
 
 	/*
 	 * Check to see if cutoff needs to set to a new level.
 	 */
 	cl = ifd->class_[ifd->qo_];
 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
 #if 1 /* ALTQ */
 		/*
 		 * The class has drained or the lender is now overlimit:
 		 * recompute the cutoff from all active classes; otherwise
 		 * pin the cutoff at the lender's depth.
 		 */
 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
 			rmc_tl_satisfied(ifd, nowp);
 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
 		} else {
 			ifd->cutoff_ = borrowed->depth_;
 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
 		}
 #else /* !ALTQ */
 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
 			reset_cutoff(ifd);
 #ifdef notdef
 			rmc_tl_satisfied(ifd, &now);
 #endif
 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
 		} else {
 			ifd->cutoff_ = borrowed->depth_;
 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
 		}
 #endif /* !ALTQ */
 	}
 
 	/*
 	 * Release class slot
 	 */
 	ifd->borrowed_[ifd->qo_] = NULL;
 	ifd->class_[ifd->qo_] = NULL;
 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
 	ifd->queued_--;
 }
 
 /*
  * void
  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
  *	drop routine: remove one packet from the class queue via
  *	_rmc_dropq() and, if that empties the queue, take the class out
  *	of the active count of its priority level.  The queue must not be
  *	empty on entry.
  *
  *	Returns: NONE
  */
 
 static void
 rmc_drop_action(struct rm_class *cl)
 {
 	ASSERT(qlen(cl->q_) > 0);
 
 	_rmc_dropq(cl);
 	if (!qempty(cl->q_))
 		return;
 
 	/* The last packet is gone: one active class fewer at this level. */
 	cl->ifdat_->na_[cl->pri_]--;
 }
 
 /*
  * Discard every packet queued on class 'cl'.  If the queue held
  * anything, the class no longer counts as active at its priority level.
  */
 void
 rmc_dropall(struct rm_class *cl)
 {
 	if (qempty(cl->q_))
 		return;
 
 	_flushq(cl->q_);
 	cl->ifdat_->na_[cl->pri_]--;
 }
 
 #if (__FreeBSD_version > 300000)
 /*
  * hzto() is removed from FreeBSD-3.0, so a local replacement is kept here.
  *
  * Converts the absolute time '*tv' into a tick count measured from now:
  * the current time from getmicrotime() is subtracted from '*tv' field by
  * field and the difference is handed to tvtohz(), whose result is
  * returned.
  *
  * The tv_usec difference is not normalized here and may be negative.
  * NOTE(review): this relies on tvtohz() accepting such a value -- confirm.
  */
 static int
 hzto(struct timeval *tv)
 {
 	struct timeval t2;
 
 	getmicrotime(&t2);
 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
 	return (tvtohz(&t2));
 }
 #endif /* __FreeBSD_version > 300000 */
 
 /*
  * void
  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
  *	delay action routine.  It is invoked via rmc_under_limit when the
  *	packet is discovered to be overlimit.
  *
  *	If the delay action is result of borrow class being overlimit, then
  *	delay for the offtime of the borrowing class that is overlimit.
  *
  *	Returns: NONE
  */
 
 void
 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
 {
 	int	delay, t, extradelay;
 
 	cl->stats_.overactions++;
 	/* Time between going overlimit (overtime_) and undertime_. */
 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
 #ifndef BORROW_OFFTIME
 	delay += cl->offtime_;
 #endif
 
 	/* A class that is already sleeping has its timer armed; leave it. */
 	if (!cl->sleeping_) {
 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
 #ifdef BORROW_OFFTIME
 		if (borrow != NULL)
 			extradelay = borrow->offtime_;
 		else
 #endif
 			extradelay = cl->offtime_;
 
 #ifdef ALTQ
 		/*
 		 * XXX recalculate suspend time:
 		 * current undertime is (tidle + pkt_time) calculated
 		 * from the last transmission.
 		 *	tidle: time required to bring avgidle back to 0
 		 *	pkt_time: target waiting time for this class
 		 * we need to replace pkt_time by offtime
 		 */
 		extradelay -= cl->last_pkttime_;
 #endif
 		/* Push undertime_ out by whatever extra delay remains. */
 		if (extradelay > 0) {
 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
 			delay += extradelay;
 		}
 
 		cl->sleeping_ = 1;
 		cl->stats_.delays++;
 
 		/*
 		 * Since packets are phased randomly with respect to the
 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
 		 * short time so we have to wait for at least two ticks.
 		 * NOTE:  If there's no other traffic, we need the timer as
 		 * a 'backstop' to restart this class.
 		 */
 		if (delay > tick * 2) {
 			/* FreeBSD rounds up the tick */
 			t = hzto(&cl->undertime_);
 		} else
 			t = 2;
 		/* rmc_restart() runs when the callout fires. */
 		CALLOUT_RESET(&cl->callout_, t, rmc_restart, cl);
 	}
 }
 
 /*
  * void
  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
  *	called by the system timer code & is responsible checking if the
  *	class is still sleeping (it might have been restarted as a side
  *	effect of the queue scan on a packet arrival) and, if so, restarting
  *	output for the class.  Inspecting the class state & restarting output
  *	require locking the class structure.  In general the driver is
  *	responsible for locking but this is the only routine that is not
  *	called directly or indirectly from the interface driver so it has
  *	to know about system locking conventions.  Here that means raising
  *	IPL with splnet(), entering the network epoch, taking the queue
  *	lock and setting the vnet context of the interface, all of which
  *	are undone in reverse order before returning.
  *
  *	Returns:	NONE
  */
 
 static void
 rmc_restart(void *arg)
 {
 	struct rm_class *cl = arg;
 	struct rm_ifdat	*ifd = cl->ifdat_;
 	struct epoch_tracker et;
 	int		 ipl;
 
 	ipl = splnet();
 	NET_EPOCH_ENTER(et);
 	IFQ_LOCK(ifd->ifq_);
 	CURVNET_SET(ifd->ifq_->altq_ifp->if_vnet);
 	if (cl->sleeping_) {
 		/* Still asleep: wake the class up. */
 		cl->undertime_.tv_sec = 0;
 		cl->sleeping_ = 0;
 
 		/* Kick the driver only if it can take another packet. */
 		if (ifd->restart != NULL && ifd->queued_ < ifd->maxqueued_) {
 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
 			(ifd->restart)(ifd->ifq_);
 		}
 	}
 	CURVNET_RESTORE();
 	IFQ_UNLOCK(ifd->ifq_);
 	NET_EPOCH_EXIT(et);
 	splx(ipl);
 }
 
 /*
  * void
  * rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) - This
  *	is the overlimit handler installed on the root class of the link
  *	sharing structure.  It panics unconditionally; both arguments are
  *	ignored.
  *
  *	Returns: NONE
  */
 
 static void
 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
 {
 	panic("rmc_root_overlimit");
 }
 
 /*
  * Packet Queue handling routines.  Eventually, this is to localize the
  *	effects on the code whether queues are red queues or droptail
  *	queues.
  */
 
 /*
  * Enqueue 'm' on class 'cl' using the class's queue discipline.
  * RIO, RED and CoDel queues are handed to their own add routine and its
  * result is returned; otherwise the packet is appended with _addq() and
  * 0 is returned.
  */
 static int
 _rmc_addq(rm_class_t *cl, mbuf_t *m)
 {
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->q_))
 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->q_))
 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
 #endif /* ALTQ_RED */
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->q_))
 		return codel_addq(cl->codel_, cl->q_, m);
 #endif
 
 	/*
 	 * Only reached for plain queues: the branches above return early,
 	 * so RMCF_CLEARDSCP is not applied here for RIO/RED/CoDel.
 	 */
 	if (cl->flags_ & RMCF_CLEARDSCP)
 		write_dsfield(m, cl->pktattr_, 0);
 
 	_addq(cl->q_, m);
 	return (0);
 }
 
 /*
  * Take one packet off the class queue with _getq() and free it.
  * note: _rmc_dropq is not called for red
  */
 static void
 _rmc_dropq(rm_class_t *cl)
 {
 	mbuf_t	*victim = _getq(cl->q_);
 
 	if (victim == NULL)
 		return;
 	m_freem(victim);
 }
 
 /*
  * Dequeue the next packet of class 'cl' using the class's queue
  * discipline: RIO, RED and CoDel queues use their own get routine,
  * anything else falls through to _getq().
  */
 static mbuf_t *
 _rmc_getq(rm_class_t *cl)
 {
 #ifdef ALTQ_RIO
 	if (q_is_rio(cl->q_))
 		return rio_getq((rio_t *)cl->red_, cl->q_);
 #endif
 #ifdef ALTQ_RED
 	if (q_is_red(cl->q_))
 		return red_getq(cl->red_, cl->q_);
 #endif
 #ifdef ALTQ_CODEL
 	if (q_is_codel(cl->q_))
 		return codel_getq(cl->codel_, cl->q_);
 #endif
 	return _getq(cl->q_);
 }
 
 /* Return the packet at the head of the class queue without removing it. */
 static mbuf_t *
 _rmc_pollq(rm_class_t *cl)
 {
 	mbuf_t	*head = qhead(cl->q_);
 
 	return (head);
 }
 
 #ifdef CBQ_TRACE
 
 /* Trace record storage; declared with NCBQTRACE+1 entries. */
 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
 /* NOTE(review): presumably the next record to fill; written outside this chunk -- confirm. */
 struct cbqtrace		*cbqtrace_ptr = NULL;
 int			 cbqtrace_count;
 
 /*
  * DDB hook to trace cbq events:
  *  the last 1024 events are held in a circular buffer.
  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
  */
 void cbqtrace_dump(int);
 static char *rmc_funcname(void *);
 
 /*
  * Address-to-name table used by rmc_funcname(); terminated by a NULL
  * entry.  A function that is not listed here is reported as "unknown".
  */
 static struct rmc_funcs {
 	void	*func;
 	char	*name;
 } rmc_funcs[] =
 {
 	rmc_init,		"rmc_init",
 	rmc_queue_packet,	"rmc_queue_packet",
 	rmc_under_limit,	"rmc_under_limit",
 	rmc_update_class_util,	"rmc_update_class_util",
 	rmc_delay_action,	"rmc_delay_action",
 	rmc_restart,		"rmc_restart",
 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
 	NULL,			NULL
 };
 
 static char *rmc_funcname(void *func)
 {
 	struct rmc_funcs *fp;
 
 	for (fp = rmc_funcs; fp->func != NULL; fp++)
 		if (fp->func == func)
 			return (fp->name);
 	return ("unknown");
 }
 
 /*
  * DDB helper: print 20 trace records starting at record 'counter'
  * (taken modulo NCBQTRACE), wrapping to the start of cbqtrace_buffer
  * when the end is reached.
  *
  * Each record is read as four consecutive ints: a value printed in hex,
  * a function address resolved through rmc_funcname(), four tag
  * characters and a decimal value.
  * NOTE(review): this assumes struct cbqtrace is four int-sized words and
  * that a function address fits in an int -- confirm against the
  * structure definition before relying on it on 64-bit platforms.
  */
 void cbqtrace_dump(int counter)
 {
 	int	 i, *p;
 	char	*cp;
 
 	/*
 	 * '%' keeps the sign of the dividend in C, so a negative 'counter'
 	 * would index before the start of the buffer; fold it into range.
 	 */
 	counter = counter % NCBQTRACE;
 	if (counter < 0)
 		counter += NCBQTRACE;
 	p = (int *)&cbqtrace_buffer[counter];
 
 	for (i=0; i<20; i++) {
 		printf("[0x%x] ", *p++);
 		printf("%s: ", rmc_funcname((void *)*p++));
 		cp = (char *)p++;
 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
 		printf("%d\n",*p++);
 
 		/* Wrap around after the last record. */
 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
 			p = (int *)cbqtrace_buffer;
 	}
 }
 #endif /* CBQ_TRACE */
 #endif /* ALTQ_CBQ */
 
 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
     defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
 
 /*
  * Append 'm' to the queue.  The queue is a circular list linked through
  * m_nextpkt in which qtail(q) points at the last packet and the tail's
  * m_nextpkt points at the first.
  */
 void
 _addq(class_queue_t *q, mbuf_t *m)
 {
 	mbuf_t	*tail = qtail(q);
 
 	if (tail == NULL) {
 		/* Empty queue: the new packet links to itself. */
 		m->m_nextpkt = m;
 	} else {
 		/* Insert between the old tail and the head. */
 		m->m_nextpkt = tail->m_nextpkt;
 		tail->m_nextpkt = m;
 	}
 	qtail(q) = m;
 	qlen(q)++;
 }
 
 /*
  * Remove and return the packet at the head of the queue (the one after
  * the tail in the circular list), or NULL if the queue is empty.
  */
 mbuf_t *
 _getq(class_queue_t *q)
 {
 	mbuf_t	*tail, *head;
 
 	tail = qtail(q);
 	if (tail == NULL)
 		return (NULL);
 
 	head = tail->m_nextpkt;
 	if (head == tail) {
 		/* Last packet: the queue becomes empty. */
 		ASSERT(qlen(q) == 1);
 		qtail(q) = NULL;
 	} else {
 		tail->m_nextpkt = head->m_nextpkt;
 	}
 	qlen(q)--;
 	head->m_nextpkt = NULL;
 	return (head);
 }
 
 /*
  * Remove and return the packet at the tail of the queue, or NULL if the
  * queue is empty.  The caller decides what to do with the packet.
  */
 mbuf_t *
 _getq_tail(class_queue_t *q)
 {
 	mbuf_t	*tail, *before;
 
 	tail = qtail(q);
 	if (tail == NULL)
 		return NULL;
 
 	/* Go around the ring to find the packet that points at the tail. */
 	for (before = tail; before->m_nextpkt != tail;
 	    before = before->m_nextpkt)
 		continue;
 
 	before->m_nextpkt = tail->m_nextpkt;
 	if (before == tail) {
 		/* The tail was the only packet. */
 		ASSERT(qlen(q) == 1);
 		qtail(q) = NULL;
 	} else {
 		qtail(q) = before;
 	}
 	qlen(q)--;
 	tail->m_nextpkt = NULL;
 	return (tail);
 }
 
 /*
  * Remove and return a randomly selected packet from the queue, or NULL
  * if the queue is empty.  The choice is made with arc4random().
  */
 mbuf_t *
 _getq_random(class_queue_t *q)
 {
 	struct mbuf	*pick, *before;
 	int		 steps;
 
 	pick = qtail(q);
 	if (pick == NULL)
 		return NULL;
 
 	if (pick->m_nextpkt == pick) {
 		/* Single packet: nothing to choose from. */
 		ASSERT(qlen(q) == 1);
 		qtail(q) = NULL;
 	} else {
 		/* Step 1..qlen packets forward from the tail. */
 		before = NULL;
 		steps = arc4random() % qlen(q) + 1;
 		for (; steps > 0; steps--) {
 			before = pick;
 			pick = pick->m_nextpkt;
 		}
 
 		/* Unlink the pick; move the tail back if the pick was it. */
 		before->m_nextpkt = pick->m_nextpkt;
 		if (pick == qtail(q))
 			qtail(q) = before;
 	}
 	qlen(q)--;
 	pick->m_nextpkt = NULL;
 	return (pick);
 }
 
 /*
  * Unlink packet 'm' from the queue.  'm' is expected to be on the
  * queue; its own m_nextpkt is left untouched.
  */
 void
 _removeq(class_queue_t *q, mbuf_t *m)
 {
 	mbuf_t	*before;
 
 	/* Starting at the tail, find the packet that points at 'm'. */
 	for (before = qtail(q); before->m_nextpkt != m;
 	    before = before->m_nextpkt)
 		continue;
 
 	before->m_nextpkt = m->m_nextpkt;
 	if (before == m) {
 		/* 'm' was the only packet. */
 		qtail(q) = NULL;
 	} else if (qtail(q) == m) {
 		qtail(q) = before;
 	}
 	qlen(q)--;
 }
 
 /* Dequeue and free every packet on the queue. */
 void
 _flushq(class_queue_t *q)
 {
 	mbuf_t *pkt;
 
 	for (;;) {
 		pkt = _getq(q);
 		if (pkt == NULL)
 			break;
 		m_freem(pkt);
 	}
 	ASSERT(qlen(q) == 0);
 }
 
 #endif /* !__GNUC__ || ALTQ_DEBUG */
 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
diff --git a/sys/net/altq/altq_subr.c b/sys/net/altq/altq_subr.c
index deadf4ecc9a9..04ea1bc91e28 100644
--- a/sys/net/altq/altq_subr.c
+++ b/sys/net/altq/altq_subr.c
@@ -1,1953 +1,1954 @@
 /*-
  * Copyright (C) 1997-2003
  *	Sony Computer Science Laboratories Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $
  * $FreeBSD$
  */
 
 #include "opt_altq.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/kernel.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/pf/pf_altq.h>
 #include <net/altq/altq.h>
 
 /* machine dependent clock related includes */
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <machine/clock.h>
 #if defined(__amd64__) || defined(__i386__)
 #include <machine/cpufunc.h>		/* for pentium tsc */
 #include <machine/specialreg.h>		/* for CPUID_TSC */
 #include <machine/md_var.h>		/* for cpu_feature */
 #endif /* __amd64 || __i386__ */
 
 /*
  * internal function prototypes
  */
 static void	tbr_timeout(void *);
 static struct mbuf *tbr_dequeue(struct ifaltq *, int);
 static int tbr_timer = 0;	/* token bucket regulator timer */
 #if !defined(__FreeBSD__) || (__FreeBSD_version < 600000)
 static struct callout tbr_callout = CALLOUT_INITIALIZER;
 #else
 static struct callout tbr_callout;
 #endif
 
 #ifdef ALTQ3_CLFIER_COMPAT
 static int 	extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *);
 #ifdef INET6
 static int 	extract_ports6(struct mbuf *, struct ip6_hdr *,
 			       struct flowinfo_in6 *);
 #endif
 static int	apply_filter4(u_int32_t, struct flow_filter *,
 			      struct flowinfo_in *);
 static int	apply_ppfilter4(u_int32_t, struct flow_filter *,
 				struct flowinfo_in *);
 #ifdef INET6
 static int	apply_filter6(u_int32_t, struct flow_filter6 *,
 			      struct flowinfo_in6 *);
 #endif
 static int	apply_tosfilter4(u_int32_t, struct flow_filter *,
 				 struct flowinfo_in *);
 static u_long	get_filt_handle(struct acc_classifier *, int);
 static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long);
 static u_int32_t filt2fibmask(struct flow_filter *);
 
 static void 	ip4f_cache(struct ip *, struct flowinfo_in *);
 static int 	ip4f_lookup(struct ip *, struct flowinfo_in *);
 static int 	ip4f_init(void);
 static struct ip4_frag	*ip4f_alloc(void);
 static void 	ip4f_free(struct ip4_frag *);
 #endif /* ALTQ3_CLFIER_COMPAT */
 
 #ifdef ALTQ
 SYSCTL_NODE(_kern_features, OID_AUTO, altq, CTLFLAG_RD | CTLFLAG_CAPRD, 0,
     "ALTQ packet queuing");
 
 #define	ALTQ_FEATURE(name, desc)					\
 	SYSCTL_INT_WITH_LABEL(_kern_features_altq, OID_AUTO, name,	\
 	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1,		\
 	    desc, "feature")
 
 #ifdef ALTQ_CBQ
 ALTQ_FEATURE(cbq, "ALTQ Class Based Queuing discipline");
 #endif
 #ifdef ALTQ_CODEL
 ALTQ_FEATURE(codel, "ALTQ Controlled Delay discipline");
 #endif
 #ifdef ALTQ_RED
 ALTQ_FEATURE(red, "ALTQ Random Early Detection discipline");
 #endif
 #ifdef ALTQ_RIO
 ALTQ_FEATURE(rio, "ALTQ Random Early Drop discipline");
 #endif
 #ifdef ALTQ_HFSC
 ALTQ_FEATURE(hfsc, "ALTQ Hierarchical Packet Scheduler discipline");
 #endif
 #ifdef ALTQ_PRIQ
 ALTQ_FEATURE(priq, "ATLQ Priority Queuing discipline");
 #endif
 #ifdef ALTQ_FAIRQ
 ALTQ_FEATURE(fairq, "ALTQ Fair Queuing discipline");
 #endif
 #endif
 
 /*
  * alternate queueing support routines
  */
 
 /* look up the queue state by the interface name and the queueing type. */
 void *
 altq_lookup(name, type)
 	char *name;
 	int type;
 {
 	struct ifnet *ifp;
 
 	if ((ifp = ifunit(name)) != NULL) {
 		/* read if_snd unlocked */
 		if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
 			return (ifp->if_snd.altq_disc);
 	}
 
 	return NULL;
 }
 
 int
 altq_attach(ifq, type, discipline, enqueue, dequeue, request)
 	struct ifaltq *ifq;
 	int type;
 	void *discipline;
 	int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
 	struct mbuf *(*dequeue)(struct ifaltq *, int);
 	int (*request)(struct ifaltq *, int, void *);
 {
 	IFQ_LOCK(ifq);
 	if (!ALTQ_IS_READY(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return ENXIO;
 	}
 
 	ifq->altq_type     = type;
 	ifq->altq_disc     = discipline;
 	ifq->altq_enqueue  = enqueue;
 	ifq->altq_dequeue  = dequeue;
 	ifq->altq_request  = request;
 	ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
 	IFQ_UNLOCK(ifq);
 	return 0;
 }
 
 int
 altq_detach(ifq)
 	struct ifaltq *ifq;
 {
 	IFQ_LOCK(ifq);
 
 	if (!ALTQ_IS_READY(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return ENXIO;
 	}
 	if (ALTQ_IS_ENABLED(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return EBUSY;
 	}
 	if (!ALTQ_IS_ATTACHED(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return (0);
 	}
 
 	ifq->altq_type     = ALTQT_NONE;
 	ifq->altq_disc     = NULL;
 	ifq->altq_enqueue  = NULL;
 	ifq->altq_dequeue  = NULL;
 	ifq->altq_request  = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 
 	IFQ_UNLOCK(ifq);
 	return 0;
 }
 
 int
 altq_enable(ifq)
 	struct ifaltq *ifq;
 {
 	int s;
 
 	IFQ_LOCK(ifq);
 
 	if (!ALTQ_IS_READY(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return ENXIO;
 	}
 	if (ALTQ_IS_ENABLED(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return 0;
 	}
 
 	s = splnet();
 	IFQ_PURGE_NOLOCK(ifq);
 	ASSERT(ifq->ifq_len == 0);
 	ifq->ifq_drv_maxlen = 0;		/* disable bulk dequeue */
 	ifq->altq_flags |= ALTQF_ENABLED;
 	splx(s);
 
 	IFQ_UNLOCK(ifq);
 	return 0;
 }
 
 int
 altq_disable(ifq)
 	struct ifaltq *ifq;
 {
 	int s;
 
 	IFQ_LOCK(ifq);
 	if (!ALTQ_IS_ENABLED(ifq)) {
 		IFQ_UNLOCK(ifq);
 		return 0;
 	}
 
 	s = splnet();
 	IFQ_PURGE_NOLOCK(ifq);
 	ASSERT(ifq->ifq_len == 0);
 	ifq->altq_flags &= ~(ALTQF_ENABLED);
 	splx(s);
 
 	IFQ_UNLOCK(ifq);
 	return 0;
 }
 
 #ifdef ALTQ_DEBUG
 void
 altq_assert(file, line, failedexpr)
 	const char *file, *failedexpr;
 	int line;
 {
 	(void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n",
 		     failedexpr, file, line);
 	panic("altq assertion");
 	/* NOTREACHED */
 }
 #endif
 
 /*
  * internal representation of token bucket parameters
  *	rate:	(byte_per_unittime << TBR_SHIFT)  / machclk_freq
  *		(((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq
  *	depth:	byte << TBR_SHIFT
  *
  */
 #define	TBR_SHIFT	29
 #define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
 #define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
 
 static struct mbuf *
 tbr_dequeue(ifq, op)
 	struct ifaltq *ifq;
 	int op;
 {
 	struct tb_regulator *tbr;
 	struct mbuf *m;
 	int64_t interval;
 	u_int64_t now;
 
 	IFQ_LOCK_ASSERT(ifq);
 	tbr = ifq->altq_tbr;
 	if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
 		/* if this is a remove after poll, bypass tbr check */
 	} else {
 		/* update token only when it is negative */
 		if (tbr->tbr_token <= 0) {
 			now = read_machclk();
 			interval = now - tbr->tbr_last;
 			if (interval >= tbr->tbr_filluptime)
 				tbr->tbr_token = tbr->tbr_depth;
 			else {
 				tbr->tbr_token += interval * tbr->tbr_rate;
 				if (tbr->tbr_token > tbr->tbr_depth)
 					tbr->tbr_token = tbr->tbr_depth;
 			}
 			tbr->tbr_last = now;
 		}
 		/* if token is still negative, don't allow dequeue */
 		if (tbr->tbr_token <= 0)
 			return (NULL);
 	}
 
 	if (ALTQ_IS_ENABLED(ifq))
 		m = (*ifq->altq_dequeue)(ifq, op);
 	else {
 		if (op == ALTDQ_POLL)
 			_IF_POLL(ifq, m);
 		else
 			_IF_DEQUEUE(ifq, m);
 	}
 
 	if (m != NULL && op == ALTDQ_REMOVE)
 		tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
 	tbr->tbr_lastop = op;
 	return (m);
 }
 
 /*
  * set a token bucket regulator.
  * if the specified rate is zero, the token bucket regulator is deleted.
  */
 int
 tbr_set(ifq, profile)
 	struct ifaltq *ifq;
 	struct tb_profile *profile;
 {
 	struct tb_regulator *tbr, *otbr;
 
 	if (tbr_dequeue_ptr == NULL)
 		tbr_dequeue_ptr = tbr_dequeue;
 
 	if (machclk_freq == 0)
 		init_machclk();
 	if (machclk_freq == 0) {
 		printf("tbr_set: no cpu clock available!\n");
 		return (ENXIO);
 	}
 
 	IFQ_LOCK(ifq);
 	if (profile->rate == 0) {
 		/* delete this tbr */
 		if ((tbr = ifq->altq_tbr) == NULL) {
 			IFQ_UNLOCK(ifq);
 			return (ENOENT);
 		}
 		ifq->altq_tbr = NULL;
 		free(tbr, M_DEVBUF);
 		IFQ_UNLOCK(ifq);
 		return (0);
 	}
 
 	tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (tbr == NULL) {
 		IFQ_UNLOCK(ifq);
 		return (ENOMEM);
 	}
 
 	tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
 	tbr->tbr_depth = TBR_SCALE(profile->depth);
 	if (tbr->tbr_rate > 0)
 		tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
 	else
 		tbr->tbr_filluptime = LLONG_MAX;
 	/*
 	 *  The longest time between tbr_dequeue() calls will be about 1
 	 *  system tick, as the callout that drives it is scheduled once per
 	 *  tick.  The refill-time detection logic in tbr_dequeue() can only
 	 *  properly detect the passage of up to LLONG_MAX machclk ticks.
 	 *  Therefore, in order for this logic to function properly in the
 	 *  extreme case, the maximum value of tbr_filluptime should be
 	 *  LLONG_MAX less one system tick's worth of machclk ticks less
 	 *  some additional slop factor (here one more system tick's worth
 	 *  of machclk ticks).
 	 */
 	if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick))
 		tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick;
 	tbr->tbr_token = tbr->tbr_depth;
 	tbr->tbr_last = read_machclk();
 	tbr->tbr_lastop = ALTDQ_REMOVE;
 
 	otbr = ifq->altq_tbr;
 	ifq->altq_tbr = tbr;	/* set the new tbr */
 
 	if (otbr != NULL)
 		free(otbr, M_DEVBUF);
 	else {
 		if (tbr_timer == 0) {
 			CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
 			tbr_timer = 1;
 		}
 	}
 	IFQ_UNLOCK(ifq);
 	return (0);
 }
 
 /*
  * tbr_timeout goes through the interface list, and kicks the drivers
  * if necessary.
  *
  * MPSAFE
  */
 static void
 tbr_timeout(arg)
 	void *arg;
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct ifnet *ifp;
 	struct epoch_tracker et;
 	int active;
 
 	active = 0;
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp;
 		    ifp = CK_STAILQ_NEXT(ifp, if_link)) {
 			/* read from if_snd unlocked */
 			if (!TBR_IS_ENABLED(&ifp->if_snd))
 				continue;
 			active++;
 			if (!IFQ_IS_EMPTY(&ifp->if_snd) &&
 			    ifp->if_start != NULL)
 				(*ifp->if_start)(ifp);
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 	if (active > 0)
 		CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
 	else
 		tbr_timer = 0;	/* don't need tbr_timer anymore */
 }
 
 /*
  * attach a discipline to the interface.  if one already exists, it is
  * overridden.
  * Locking is done in the discipline specific attach functions. Basically
  * they call back to altq_attach which takes care of the attach and locking.
  */
 int
 altq_pfattach(struct pf_altq *a)
 {
 	int error = 0;
 
 	switch (a->scheduler) {
 	case ALTQT_NONE:
 		break;
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_pfattach(a);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_pfattach(a);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_pfattach(a);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
 	case ALTQT_FAIRQ:
 		error = fairq_pfattach(a);
 		break;
 #endif
 #ifdef ALTQ_CODEL
 	case ALTQT_CODEL:
 		error = codel_pfattach(a);
 		break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * detach a discipline from the interface.
  * it is possible that the discipline was already overridden by another
  * discipline.
  */
 int
 altq_pfdetach(struct pf_altq *a)
 {
 	struct ifnet *ifp;
 	int s, error = 0;
 
 	if ((ifp = ifunit(a->ifname)) == NULL)
 		return (EINVAL);
 
 	/* if this discipline is no longer referenced, just return */
 	/* read unlocked from if_snd */
 	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
 		return (0);
 
 	s = splnet();
 	/* read unlocked from if_snd, _disable and _detach take care */
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		error = altq_disable(&ifp->if_snd);
 	if (error == 0)
 		error = altq_detach(&ifp->if_snd);
 	splx(s);
 
 	return (error);
 }
 
 /*
  * add a discipline or a queue
  * Locking is done in the discipline specific functions with regards to
  * malloc with WAITOK, also it is not yet clear which lock to use.
  */
 int
 altq_add(struct ifnet *ifp, struct pf_altq *a)
 {
 	int error = 0;
 
 	if (a->qname[0] != 0)
 		return (altq_add_queue(a));
 
 	if (machclk_freq == 0)
 		init_machclk();
 	if (machclk_freq == 0)
 		panic("altq_add: no cpu clock");
 
 	switch (a->scheduler) {
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_add_altq(ifp, a);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_add_altq(ifp, a);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_add_altq(ifp, a);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
         case ALTQT_FAIRQ:
                 error = fairq_add_altq(ifp, a);
                 break;
 #endif
 #ifdef ALTQ_CODEL
 	case ALTQT_CODEL:
 		error = codel_add_altq(ifp, a);
 		break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * remove a discipline or a queue
  * It is yet unclear what lock to use to protect this operation, the
  * discipline specific functions will determine and grab it
  */
 int
 altq_remove(struct pf_altq *a)
 {
 	int error = 0;
 
 	if (a->qname[0] != 0)
 		return (altq_remove_queue(a));
 
 	switch (a->scheduler) {
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_remove_altq(a);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_remove_altq(a);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_remove_altq(a);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
         case ALTQT_FAIRQ:
                 error = fairq_remove_altq(a);
                 break;
 #endif
 #ifdef ALTQ_CODEL
 	case ALTQT_CODEL:
 		error = codel_remove_altq(a);
 		break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * add a queue to the discipline
  * It is yet unclear what lock to use to protect this operation, the
  * discipline specific functions will determine and grab it
  */
 int
 altq_add_queue(struct pf_altq *a)
 {
 	int error = 0;
 
 	switch (a->scheduler) {
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_add_queue(a);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_add_queue(a);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_add_queue(a);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
         case ALTQT_FAIRQ:
                 error = fairq_add_queue(a);
                 break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * remove a queue from the discipline
  * It is yet unclear what lock to use to protect this operation, the
  * discipline specific functions will determine and grab it
  */
 int
 altq_remove_queue(struct pf_altq *a)
 {
 	int error = 0;
 
 	switch (a->scheduler) {
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_remove_queue(a);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_remove_queue(a);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_remove_queue(a);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
         case ALTQT_FAIRQ:
                 error = fairq_remove_queue(a);
                 break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * get queue statistics
  * Locking is done in the discipline specific functions with regards to
  * copyout operations, also it is not yet clear which lock to use.
  */
 int
 altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
 {
 	int error = 0;
 
 	switch (a->scheduler) {
 #ifdef ALTQ_CBQ
 	case ALTQT_CBQ:
 		error = cbq_getqstats(a, ubuf, nbytes, version);
 		break;
 #endif
 #ifdef ALTQ_PRIQ
 	case ALTQT_PRIQ:
 		error = priq_getqstats(a, ubuf, nbytes, version);
 		break;
 #endif
 #ifdef ALTQ_HFSC
 	case ALTQT_HFSC:
 		error = hfsc_getqstats(a, ubuf, nbytes, version);
 		break;
 #endif
 #ifdef ALTQ_FAIRQ
         case ALTQT_FAIRQ:
                 error = fairq_getqstats(a, ubuf, nbytes, version);
                 break;
 #endif
 #ifdef ALTQ_CODEL
 	case ALTQT_CODEL:
 		error = codel_getqstats(a, ubuf, nbytes, version);
 		break;
 #endif
 	default:
 		error = ENXIO;
 	}
 
 	return (error);
 }
 
 /*
  * read and write diffserv field in IPv4 or IPv6 header
  */
 u_int8_t
 read_dsfield(m, pktattr)
 	struct mbuf *m;
 	struct altq_pktattr *pktattr;
 {
 	struct mbuf *m0;
 	u_int8_t ds_field = 0;
 
 	if (pktattr == NULL ||
 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
 		return ((u_int8_t)0);
 
 	/* verify that pattr_hdr is within the mbuf data */
 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
 		if ((pktattr->pattr_hdr >= m0->m_data) &&
 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
 			break;
 	if (m0 == NULL) {
 		/* ick, pattr_hdr is stale */
 		pktattr->pattr_af = AF_UNSPEC;
 #ifdef ALTQ_DEBUG
 		printf("read_dsfield: can't locate header!\n");
 #endif
 		return ((u_int8_t)0);
 	}
 
 	if (pktattr->pattr_af == AF_INET) {
 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
 
 		if (ip->ip_v != 4)
 			return ((u_int8_t)0);	/* version mismatch! */
 		ds_field = ip->ip_tos;
 	}
 #ifdef INET6
 	else if (pktattr->pattr_af == AF_INET6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 		u_int32_t flowlabel;
 
 		flowlabel = ntohl(ip6->ip6_flow);
 		if ((flowlabel >> 28) != 6)
 			return ((u_int8_t)0);	/* version mismatch! */
 		ds_field = (flowlabel >> 20) & 0xff;
 	}
 #endif
 	return (ds_field);
 }
 
 void
 write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
 {
 	struct mbuf *m0;
 
 	if (pktattr == NULL ||
 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
 		return;
 
 	/* verify that pattr_hdr is within the mbuf data */
 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
 		if ((pktattr->pattr_hdr >= m0->m_data) &&
 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
 			break;
 	if (m0 == NULL) {
 		/* ick, pattr_hdr is stale */
 		pktattr->pattr_af = AF_UNSPEC;
 #ifdef ALTQ_DEBUG
 		printf("write_dsfield: can't locate header!\n");
 #endif
 		return;
 	}
 
 	if (pktattr->pattr_af == AF_INET) {
 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
 		u_int8_t old;
 		int32_t sum;
 
 		if (ip->ip_v != 4)
 			return;		/* version mismatch! */
 		old = ip->ip_tos;
 		dsfield |= old & 3;	/* leave CU bits */
 		if (old == dsfield)
 			return;
 		ip->ip_tos = dsfield;
 		/*
 		 * update checksum (from RFC1624)
 		 *	   HC' = ~(~HC + ~m + m')
 		 */
 		sum = ~ntohs(ip->ip_sum) & 0xffff;
 		sum += 0xff00 + (~old & 0xff) + dsfield;
 		sum = (sum >> 16) + (sum & 0xffff);
 		sum += (sum >> 16);  /* add carry */
 
 		ip->ip_sum = htons(~sum & 0xffff);
 	}
 #ifdef INET6
 	else if (pktattr->pattr_af == AF_INET6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 		u_int32_t flowlabel;
 
 		flowlabel = ntohl(ip6->ip6_flow);
 		if ((flowlabel >> 28) != 6)
 			return;		/* version mismatch! */
 		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
 		ip6->ip6_flow = htonl(flowlabel);
 	}
 #endif
 	return;
 }
 
 /*
  * high resolution clock support taking advantage of a machine dependent
  * high resolution time counter (e.g., timestamp counter of intel pentium).
  * we assume
  *  - 64-bit-long monotonically-increasing counter
  *  - frequency range is 100M-4GHz (CPU speed)
  */
 /* if pcc is not available or disabled, emulate 256MHz using microtime() */
 #define	MACHCLK_SHIFT	8
 
 int machclk_usepcc;
 u_int32_t machclk_freq;
 u_int32_t machclk_per_tick;
 
 #if defined(__i386__) && defined(__NetBSD__)
 extern u_int64_t cpu_tsc_freq;
 #endif
 
 #if (__FreeBSD_version >= 700035)
 /* Update TSC freq with the value indicated by the caller. */
 static void
 tsc_freq_changed(void *arg, const struct cf_level *level, int status)
 {
 	/* If there was an error during the transition, don't do anything. */
 	if (status != 0)
 		return;
 
 #if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__))
 	/* If TSC is P-state invariant, don't do anything. */
 	if (tsc_is_invariant)
 		return;
 #endif
 
 	/* Total setting for this level gives the new frequency in MHz. */
 	init_machclk();
 }
 EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
     EVENTHANDLER_PRI_LAST);
 #endif /* __FreeBSD_version >= 700035 */
 
 static void
 init_machclk_setup(void)
 {
 	callout_init(&tbr_callout, 1);
 
 	machclk_usepcc = 1;
 
 #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
 	machclk_usepcc = 0;
 #endif
 #if defined(__FreeBSD__) && defined(SMP)
 	machclk_usepcc = 0;
 #endif
 #if defined(__NetBSD__) && defined(MULTIPROCESSOR)
 	machclk_usepcc = 0;
 #endif
 #if defined(__amd64__) || defined(__i386__)
 	/* check if TSC is available */
 	if ((cpu_feature & CPUID_TSC) == 0 ||
 	    atomic_load_acq_64(&tsc_freq) == 0)
 		machclk_usepcc = 0;
 #endif
 }
 
 void
 init_machclk(void)
 {
 	static int called;
 
 	/* Call one-time initialization function. */
 	if (!called) {
 		init_machclk_setup();
 		called = 1;
 	}
 
 	if (machclk_usepcc == 0) {
 		/* emulate 256MHz using microtime() */
 		machclk_freq = 1000000 << MACHCLK_SHIFT;
 		machclk_per_tick = machclk_freq / hz;
 #ifdef ALTQ_DEBUG
 		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
 #endif
 		return;
 	}
 
 	/*
 	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
 	 * accessible, just use it.
 	 */
 #if defined(__amd64__) || defined(__i386__)
 	machclk_freq = atomic_load_acq_64(&tsc_freq);
 #endif
 
 	/*
 	 * if we don't know the clock frequency, measure it.
 	 */
 	if (machclk_freq == 0) {
 		static int	wait;
 		struct timeval	tv_start, tv_end;
 		u_int64_t	start, end, diff;
 		int		timo;
 
 		microtime(&tv_start);
 		start = read_machclk();
 		timo = hz;	/* 1 sec */
 		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
 		microtime(&tv_end);
 		end = read_machclk();
 		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
 		    + tv_end.tv_usec - tv_start.tv_usec;
 		if (diff != 0)
 			machclk_freq = (u_int)((end - start) * 1000000 / diff);
 	}
 
 	machclk_per_tick = machclk_freq / hz;
 
 #ifdef ALTQ_DEBUG
 	printf("altq: CPU clock: %uHz\n", machclk_freq);
 #endif
 }
 
 #if defined(__OpenBSD__) && defined(__i386__)
 static __inline u_int64_t
 rdtsc(void)
 {
 	u_int64_t rv;
 	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
 	return (rv);
 }
 #endif /* __OpenBSD__ && __i386__ */
 
 u_int64_t
 read_machclk(void)
 {
 	u_int64_t val;
 
 	if (machclk_usepcc) {
 #if defined(__amd64__) || defined(__i386__)
 		val = rdtsc();
 #else
 		panic("read_machclk");
 #endif
 	} else {
 		struct timeval tv, boottime;
 
 		microtime(&tv);
 		getboottime(&boottime);
 		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
 		    + tv.tv_usec) << MACHCLK_SHIFT);
 	}
 	return (val);
 }
 
 #ifdef ALTQ3_CLFIER_COMPAT
 
 #ifndef IPPROTO_ESP
 #define	IPPROTO_ESP	50		/* encapsulating security payload */
 #endif
 #ifndef IPPROTO_AH
 #define	IPPROTO_AH	51		/* authentication header */
 #endif
 
 /*
  * extract flow information from a given packet.
  * filt_mask shows flowinfo fields required.
  * we assume the ip header is in one mbuf, and addresses and ports are
  * in network byte order.
  */
 int
 altq_extractflow(m, af, flow, filt_bmask)
 	struct mbuf *m;
 	int af;
 	struct flowinfo *flow;
 	u_int32_t	filt_bmask;
 {
 
 	switch (af) {
 	case PF_INET: {
 		struct flowinfo_in *fin;
 		struct ip *ip;
 
 		ip = mtod(m, struct ip *);
 
 		if (ip->ip_v != 4)
 			break;
 
 		fin = (struct flowinfo_in *)flow;
 		fin->fi_len = sizeof(struct flowinfo_in);
 		fin->fi_family = AF_INET;
 
 		fin->fi_proto = ip->ip_p;
 		fin->fi_tos = ip->ip_tos;
 
 		fin->fi_src.s_addr = ip->ip_src.s_addr;
 		fin->fi_dst.s_addr = ip->ip_dst.s_addr;
 
 		if (filt_bmask & FIMB4_PORTS)
 			/* if port info is required, extract port numbers */
 			extract_ports4(m, ip, fin);
 		else {
 			fin->fi_sport = 0;
 			fin->fi_dport = 0;
 			fin->fi_gpi = 0;
 		}
 		return (1);
 	}
 
 #ifdef INET6
 	case PF_INET6: {
 		struct flowinfo_in6 *fin6;
 		struct ip6_hdr *ip6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		/* should we check the ip version? */
 
 		fin6 = (struct flowinfo_in6 *)flow;
 		fin6->fi6_len = sizeof(struct flowinfo_in6);
 		fin6->fi6_family = AF_INET6;
 
 		fin6->fi6_proto = ip6->ip6_nxt;
 		fin6->fi6_tclass   = IPV6_TRAFFIC_CLASS(ip6);
 
 		fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff);
 		fin6->fi6_src = ip6->ip6_src;
 		fin6->fi6_dst = ip6->ip6_dst;
 
 		if ((filt_bmask & FIMB6_PORTS) ||
 		    ((filt_bmask & FIMB6_PROTO)
 		     && ip6->ip6_nxt > IPPROTO_IPV6))
 			/*
 			 * if port info is required, or proto is required
 			 * but there are option headers, extract port
 			 * and protocol numbers.
 			 */
 			extract_ports6(m, ip6, fin6);
 		else {
 			fin6->fi6_sport = 0;
 			fin6->fi6_dport = 0;
 			fin6->fi6_gpi = 0;
 		}
 		return (1);
 	}
 #endif /* INET6 */
 
 	default:
 		break;
 	}
 
 	/* failed */
 	flow->fi_len = sizeof(struct flowinfo);
 	flow->fi_family = AF_UNSPEC;
 	return (0);
 }
 
 /*
  * helper routine to extract port numbers
  */
 /* structure for ipsec and ipv6 option header template */
 struct _opt6 {
 	u_int8_t	opt6_nxt;	/* next header */
 	u_int8_t	opt6_hlen;	/* header extension length */
 	u_int16_t	_pad;
 	u_int32_t	ah_spi;		/* security parameter index
 					   for authentication header */
 };
 
 /*
  * extract port numbers from a ipv4 packet.
  */
 static int
 extract_ports4(m, ip, fin)
 	struct mbuf *m;
 	struct ip *ip;
 	struct flowinfo_in *fin;
 {
 	struct mbuf *m0;
 	u_short ip_off;
 	u_int8_t proto;
 	int 	off;
 
 	fin->fi_sport = 0;
 	fin->fi_dport = 0;
 	fin->fi_gpi = 0;
 
 	ip_off = ntohs(ip->ip_off);
 	/* if it is a fragment, try cached fragment info */
 	if (ip_off & IP_OFFMASK) {
 		ip4f_lookup(ip, fin);
 		return (1);
 	}
 
 	/* locate the mbuf containing the protocol header */
 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
 		if (((caddr_t)ip >= m0->m_data) &&
 		    ((caddr_t)ip < m0->m_data + m0->m_len))
 			break;
 	if (m0 == NULL) {
 #ifdef ALTQ_DEBUG
 		printf("extract_ports4: can't locate header! ip=%p\n", ip);
 #endif
 		return (0);
 	}
 	off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2);
 	proto = ip->ip_p;
 
 #ifdef ALTQ_IPSEC
  again:
 #endif
 	while (off >= m0->m_len) {
 		off -= m0->m_len;
 		m0 = m0->m_next;
 		if (m0 == NULL)
 			return (0);  /* bogus ip_hl! */
 	}
 	if (m0->m_len < off + 4)
 		return (0);
 
 	switch (proto) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP: {
 		struct udphdr *udp;
 
 		udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
 		fin->fi_sport = udp->uh_sport;
 		fin->fi_dport = udp->uh_dport;
 		fin->fi_proto = proto;
 		}
 		break;
 
 #ifdef ALTQ_IPSEC
 	case IPPROTO_ESP:
 		if (fin->fi_gpi == 0){
 			u_int32_t *gpi;
 
 			gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
 			fin->fi_gpi   = *gpi;
 		}
 		fin->fi_proto = proto;
 		break;
 
 	case IPPROTO_AH: {
 			/* get next header and header length */
 			struct _opt6 *opt6;
 
 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
 			proto = opt6->opt6_nxt;
 			off += 8 + (opt6->opt6_hlen * 4);
 			if (fin->fi_gpi == 0 && m0->m_len >= off + 8)
 				fin->fi_gpi = opt6->ah_spi;
 		}
 		/* goto the next header */
 		goto again;
 #endif  /* ALTQ_IPSEC */
 
 	default:
 		fin->fi_proto = proto;
 		return (0);
 	}
 
 	/* if this is a first fragment, cache it. */
 	if (ip_off & IP_MF)
 		ip4f_cache(ip, fin);
 
 	return (1);
 }
 
 #ifdef INET6
 static int
 extract_ports6(m, ip6, fin6)
 	struct mbuf *m;
 	struct ip6_hdr *ip6;
 	struct flowinfo_in6 *fin6;
 {
 	struct mbuf *m0;
 	int	off;
 	u_int8_t proto;
 
 	fin6->fi6_gpi   = 0;
 	fin6->fi6_sport = 0;
 	fin6->fi6_dport = 0;
 
 	/* locate the mbuf containing the protocol header */
 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
 		if (((caddr_t)ip6 >= m0->m_data) &&
 		    ((caddr_t)ip6 < m0->m_data + m0->m_len))
 			break;
 	if (m0 == NULL) {
 #ifdef ALTQ_DEBUG
 		printf("extract_ports6: can't locate header! ip6=%p\n", ip6);
 #endif
 		return (0);
 	}
 	off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr);
 
 	proto = ip6->ip6_nxt;
 	do {
 		while (off >= m0->m_len) {
 			off -= m0->m_len;
 			m0 = m0->m_next;
 			if (m0 == NULL)
 				return (0);
 		}
 		if (m0->m_len < off + 4)
 			return (0);
 
 		switch (proto) {
 		case IPPROTO_TCP:
 		case IPPROTO_UDP: {
 			struct udphdr *udp;
 
 			udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
 			fin6->fi6_sport = udp->uh_sport;
 			fin6->fi6_dport = udp->uh_dport;
 			fin6->fi6_proto = proto;
 			}
 			return (1);
 
 		case IPPROTO_ESP:
 			if (fin6->fi6_gpi == 0) {
 				u_int32_t *gpi;
 
 				gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
 				fin6->fi6_gpi   = *gpi;
 			}
 			fin6->fi6_proto = proto;
 			return (1);
 
 		case IPPROTO_AH: {
 			/* get next header and header length */
 			struct _opt6 *opt6;
 
 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
 			if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8)
 				fin6->fi6_gpi = opt6->ah_spi;
 			proto = opt6->opt6_nxt;
 			off += 8 + (opt6->opt6_hlen * 4);
 			/* goto the next header */
 			break;
 			}
 
 		case IPPROTO_HOPOPTS:
 		case IPPROTO_ROUTING:
 		case IPPROTO_DSTOPTS: {
 			/* get next header and header length */
 			struct _opt6 *opt6;
 
 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
 			proto = opt6->opt6_nxt;
 			off += (opt6->opt6_hlen + 1) * 8;
 			/* goto the next header */
 			break;
 			}
 
 		case IPPROTO_FRAGMENT:
 			/* ipv6 fragmentations are not supported yet */
 		default:
 			fin6->fi6_proto = proto;
 			return (0);
 		}
 	} while (1);
 	/*NOTREACHED*/
 }
 #endif /* INET6 */
 
 /*
  * altq common classifier
  */
 /*
  * Install a copy of `filter' into `classifier', bound to `class', and
  * hand the new filter's handle back through `phandle'.
  *
  * The stored copy is normalized first: a zero address is treated as a
  * wildcard (its mask is cleared), a non-zero address without a mask gets
  * a full mask, and address bits outside the mask are cleared.  The hash
  * bucket is chosen from the IPv4 destination address when its mask is a
  * full mask, or from the IPv6 flow label when that is non-zero; every
  * other filter goes into the ACC_WILDCARD_INDEX bucket.  Inside a bucket,
  * filters are kept in descending ff_ruleno order.
  *
  * Returns 0 on success, or EINVAL when the address family is not
  * supported.
  */
 int
 acc_add_filter(classifier, filter, class, phandle)
 	struct acc_classifier *classifier;
 	struct flow_filter *filter;
 	void	*class;
 	u_long	*phandle;
 {
 	struct acc_filter *afp, *prev, *tmp;
 	int	i, s;
 
 #ifdef INET6
 	if (filter->ff_flow.fi_family != AF_INET &&
 	    filter->ff_flow.fi_family != AF_INET6)
 		return (EINVAL);
 #else
 	if (filter->ff_flow.fi_family != AF_INET)
 		return (EINVAL);
 #endif
 
 	/*
 	 * An M_WAITOK allocation sleeps until memory is available and
 	 * never returns NULL, so no failure branch is needed here.
 	 */
 	afp = malloc(sizeof(*afp), M_DEVBUF, M_WAITOK);
 	bzero(afp, sizeof(*afp));
 
 	afp->f_filter = *filter;
 	afp->f_class = class;
 
 	i = ACC_WILDCARD_INDEX;
 	if (filter->ff_flow.fi_family == AF_INET) {
 		struct flow_filter *filter4 = &afp->f_filter;
 
 		/*
 		 * if address is 0, it's a wildcard.  if address mask
 		 * isn't set, use full mask.
 		 */
 		if (filter4->ff_flow.fi_dst.s_addr == 0)
 			filter4->ff_mask.mask_dst.s_addr = 0;
 		else if (filter4->ff_mask.mask_dst.s_addr == 0)
 			filter4->ff_mask.mask_dst.s_addr = 0xffffffff;
 		if (filter4->ff_flow.fi_src.s_addr == 0)
 			filter4->ff_mask.mask_src.s_addr = 0;
 		else if (filter4->ff_mask.mask_src.s_addr == 0)
 			filter4->ff_mask.mask_src.s_addr = 0xffffffff;
 
 		/* clear extra bits in addresses  */
 		filter4->ff_flow.fi_dst.s_addr &=
 		    filter4->ff_mask.mask_dst.s_addr;
 		filter4->ff_flow.fi_src.s_addr &=
 		    filter4->ff_mask.mask_src.s_addr;
 
 		/*
 		 * if dst address is a wildcard, use hash-entry
 		 * ACC_WILDCARD_INDEX.
 		 */
 		if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff)
 			i = ACC_WILDCARD_INDEX;
 		else
 			i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr);
 	}
 #ifdef INET6
 	else if (filter->ff_flow.fi_family == AF_INET6) {
 		struct flow_filter6 *filter6 =
 			(struct flow_filter6 *)&afp->f_filter;
 #ifndef IN6MASK0 /* taken from kame ipv6 */
 #define	IN6MASK0	{{{ 0, 0, 0, 0 }}}
 #define	IN6MASK128	{{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}}
 		const struct in6_addr in6mask0 = IN6MASK0;
 		const struct in6_addr in6mask128 = IN6MASK128;
 #endif
 
 		/* same wildcard / full-mask defaulting as the IPv4 case */
 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst))
 			filter6->ff_mask6.mask6_dst = in6mask0;
 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst))
 			filter6->ff_mask6.mask6_dst = in6mask128;
 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src))
 			filter6->ff_mask6.mask6_src = in6mask0;
 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src))
 			filter6->ff_mask6.mask6_src = in6mask128;
 
 		/* clear extra bits in addresses  */
 		for (i = 0; i < 16; i++)
 			filter6->ff_flow6.fi6_dst.s6_addr[i] &=
 			    filter6->ff_mask6.mask6_dst.s6_addr[i];
 		for (i = 0; i < 16; i++)
 			filter6->ff_flow6.fi6_src.s6_addr[i] &=
 			    filter6->ff_mask6.mask6_src.s6_addr[i];
 
 		/* a zero flow label cannot be hashed; use the wildcard bucket */
 		if (filter6->ff_flow6.fi6_flowlabel == 0)
 			i = ACC_WILDCARD_INDEX;
 		else
 			i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel);
 	}
 #endif /* INET6 */
 
 	afp->f_handle = get_filt_handle(classifier, i);
 
 	/* update filter bitmask */
 	afp->f_fbmask = filt2fibmask(filter);
 	classifier->acc_fbmask |= afp->f_fbmask;
 
 	/*
 	 * add this filter to the filter list.
 	 * filters are ordered from the highest rule number.
 	 */
 	s = splnet();
 	prev = NULL;
 	LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) {
 		if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno)
 			prev = tmp;
 		else
 			break;
 	}
 	if (prev == NULL)
 		LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain);
 	else
 		LIST_INSERT_AFTER(prev, afp, f_chain);
 	splx(s);
 
 	*phandle = afp->f_handle;
 	return (0);
 }
 
 /*
  * Remove the filter identified by `handle' from `classifier' and free it.
  * Returns EINVAL when no filter carries that handle, 0 otherwise.
  */
 int
 acc_delete_filter(classifier, handle)
 	struct acc_classifier *classifier;
 	u_long handle;
 {
 	struct acc_filter *victim;
 	int	spl;
 
 	victim = filth_to_filtp(classifier, handle);
 	if (victim == NULL)
 		return (EINVAL);
 
 	/* unlink inside the splnet()/splx() section, then release it */
 	spl = splnet();
 	LIST_REMOVE(victim, f_chain);
 	splx(spl);
 	free(victim, M_DEVBUF);
 
 	/* todo: update filt_bmask */
 
 	return (0);
 }
 
 /*
  * delete filters referencing to the specified class.
  * if the all flag is not 0, delete all the filters.
  */
 int
 acc_discard_filters(classifier, class, all)
 	struct acc_classifier *classifier;
 	void	*class;
 	int	all;
 {
 	struct acc_filter *afp;
 	int	i, s;
 
 	s = splnet();
 	for (i = 0; i < ACC_FILTER_TABLESIZE; i++) {
 		do {
 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
 				if (all || afp->f_class == class) {
 					LIST_REMOVE(afp, f_chain);
 					free(afp, M_DEVBUF);
 					/* start again from the head */
 					break;
 				}
 		} while (afp != NULL);
 	}
 	splx(s);
 
 	if (all)
 		classifier->acc_fbmask = 0;
 
 	return (0);
 }
 
 /*
  * Classify mbuf `m' of address family `af' against the filters of the
  * classifier passed as `clfier' (a struct acc_classifier *).
  *
  * The flow information is extracted from the packet, restricted to the
  * fields named in the classifier's accumulated filter bitmask, and then
  * compared against the installed filters.
  *
  * Returns the f_class pointer of the first matching filter, or NULL when
  * no filter matches.
  */
 void *
 acc_classify(clfier, m, af)
 	void *clfier;
 	struct mbuf *m;
 	int af;
 {
 	struct acc_classifier *classifier;
 	struct flowinfo flow;
 	struct acc_filter *afp;
 	int	i;
 
 	classifier = (struct acc_classifier *)clfier;
 	altq_extractflow(m, af, &flow, classifier->acc_fbmask);
 
 	if (flow.fi_family == AF_INET) {
 		struct flowinfo_in *fp = (struct flowinfo_in *)&flow;
 
 		/*
 		 * The first two branches are shortcuts that look only at the
 		 * wildcard bucket with a reduced comparison function.
 		 */
 		if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) {
 			/* only tos is used */
 			LIST_FOREACH(afp,
 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
 				 f_chain)
 				if (apply_tosfilter4(afp->f_fbmask,
 						     &afp->f_filter, fp))
 					/* filter matched */
 					return (afp->f_class);
 		} else if ((classifier->acc_fbmask &
 			(~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL))
 		    == 0) {
 			/* only proto and ports are used */
 			LIST_FOREACH(afp,
 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
 				 f_chain)
 				if (apply_ppfilter4(afp->f_fbmask,
 						    &afp->f_filter, fp))
 					/* filter matched */
 					return (afp->f_class);
 		} else {
 			/* get the filter hash entry from its dest address */
 			i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr);
 			do {
 				/*
 				 * go through this loop twice.  first for dst
 				 * hash, second for wildcards.
 				 */
 				LIST_FOREACH(afp, &classifier->acc_filters[i],
 					     f_chain)
 					if (apply_filter4(afp->f_fbmask,
 							  &afp->f_filter, fp))
 						/* filter matched */
 						return (afp->f_class);
 
 				/*
 				 * check again for filters with a dst addr
 				 * wildcard.
 				 * (daddr == 0 || dmask != 0xffffffff).
 				 */
 				if (i != ACC_WILDCARD_INDEX)
 					i = ACC_WILDCARD_INDEX;
 				else
 					break;
 			} while (1);
 		}
 	}
 #ifdef INET6
 	else if (flow.fi_family == AF_INET6) {
 		struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow;
 
 		/* get the filter hash entry from its flow ID */
 		if (fp6->fi6_flowlabel != 0)
 			i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel);
 		else
 			/* the flow label can be zero */
 			i = ACC_WILDCARD_INDEX;
 
 		/* go through this loop twice.  first for flow hash, second
 		   for wildcards. */
 		do {
 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
 				if (apply_filter6(afp->f_fbmask,
 					(struct flow_filter6 *)&afp->f_filter,
 					fp6))
 					/* filter matched */
 					return (afp->f_class);
 
 			/*
 			 * check again for filters with a wildcard.
 			 */
 			if (i != ACC_WILDCARD_INDEX)
 				i = ACC_WILDCARD_INDEX;
 			else
 				break;
 		} while (1);
 	}
 #endif /* INET6 */
 
 	/* no filter matched */
 	return (NULL);
 }
 
 /*
  * Compare the flow info of an IPv4 packet with a filter.  Only the fields
  * selected in fbmask are compared; addresses and tos are compared after
  * the packet value has been ANDed with the filter's mask.
  * Returns 1 when every selected field matches, 0 otherwise.
  */
 static int
 apply_filter4(fbmask, filt, pkt)
 	u_int32_t	fbmask;
 	struct flow_filter *filt;
 	struct flowinfo_in *pkt;
 {
 	/* a non-IPv4 filter never matches an IPv4 packet */
 	if (filt->ff_flow.fi_family != AF_INET)
 		return (0);
 
 	if (fbmask & FIMB4_SPORT) {
 		if (filt->ff_flow.fi_sport != pkt->fi_sport)
 			return (0);
 	}
 	if (fbmask & FIMB4_DPORT) {
 		if (filt->ff_flow.fi_dport != pkt->fi_dport)
 			return (0);
 	}
 	if (fbmask & FIMB4_DADDR) {
 		if (filt->ff_flow.fi_dst.s_addr !=
 		    (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr))
 			return (0);
 	}
 	if (fbmask & FIMB4_SADDR) {
 		if (filt->ff_flow.fi_src.s_addr !=
 		    (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr))
 			return (0);
 	}
 	if (fbmask & FIMB4_PROTO) {
 		if (filt->ff_flow.fi_proto != pkt->fi_proto)
 			return (0);
 	}
 	if (fbmask & FIMB4_TOS) {
 		if (filt->ff_flow.fi_tos !=
 		    (pkt->fi_tos & filt->ff_mask.mask_tos))
 			return (0);
 	}
 	if (fbmask & FIMB4_GPI) {
 		if (filt->ff_flow.fi_gpi != (pkt->fi_gpi))
 			return (0);
 	}
 
 	/* nothing disagreed */
 	return (1);
 }
 
 /*
  * Reduced IPv4 matcher for the common case where filters look only at
  * the protocol and the port numbers.  Fields not selected in fbmask are
  * ignored.  Returns 1 on match, 0 otherwise.
  */
 static int
 apply_ppfilter4(fbmask, filt, pkt)
 	u_int32_t	fbmask;
 	struct flow_filter *filt;
 	struct flowinfo_in *pkt;
 {
 	/* each clause passes when its field is unselected or equal */
 	return (filt->ff_flow.fi_family == AF_INET &&
 	    (!(fbmask & FIMB4_SPORT) ||
 	     filt->ff_flow.fi_sport == pkt->fi_sport) &&
 	    (!(fbmask & FIMB4_DPORT) ||
 	     filt->ff_flow.fi_dport == pkt->fi_dport) &&
 	    (!(fbmask & FIMB4_PROTO) ||
 	     filt->ff_flow.fi_proto == pkt->fi_proto));
 }
 
 /*
  * Reduced IPv4 matcher that looks at the tos field only.  The packet's
  * tos is ANDed with the filter's tos mask before the comparison.
  * Returns 1 on match, 0 otherwise.
  */
 static int
 apply_tosfilter4(fbmask, filt, pkt)
 	u_int32_t	fbmask;
 	struct flow_filter *filt;
 	struct flowinfo_in *pkt;
 {
 	/* passes when tos is unselected or the masked value is equal */
 	return (filt->ff_flow.fi_family == AF_INET &&
 	    (!(fbmask & FIMB4_TOS) ||
 	     filt->ff_flow.fi_tos ==
 	     (pkt->fi_tos & filt->ff_mask.mask_tos)));
 }
 
 #ifdef INET6
 /*
  * Compare the flow info of an IPv6 packet with a filter.  Only the fields
  * selected in fbmask are compared; addresses are compared one 32-bit word
  * at a time after masking the packet value, and the traffic class is
  * masked the same way.  Returns 1 on match, 0 otherwise.
  */
 static int
 apply_filter6(fbmask, filt, pkt)
 	u_int32_t	fbmask;
 	struct flow_filter6 *filt;
 	struct flowinfo_in6 *pkt;
 {
 	int w;
 
 	if (filt->ff_flow6.fi6_family != AF_INET6)
 		return (0);
 
 	if (fbmask & FIMB6_FLABEL) {
 		if (filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel)
 			return (0);
 	}
 	if (fbmask & FIMB6_PROTO) {
 		if (filt->ff_flow6.fi6_proto != pkt->fi6_proto)
 			return (0);
 	}
 	if (fbmask & FIMB6_SPORT) {
 		if (filt->ff_flow6.fi6_sport != pkt->fi6_sport)
 			return (0);
 	}
 	if (fbmask & FIMB6_DPORT) {
 		if (filt->ff_flow6.fi6_dport != pkt->fi6_dport)
 			return (0);
 	}
 	if (fbmask & FIMB6_SADDR) {
 		/* source address, word by word */
 		for (w = 0; w < 4; w++) {
 			if (filt->ff_flow6.fi6_src.s6_addr32[w] !=
 			    (pkt->fi6_src.s6_addr32[w] &
 			     filt->ff_mask6.mask6_src.s6_addr32[w]))
 				return (0);
 		}
 	}
 	if (fbmask & FIMB6_DADDR) {
 		/* destination address, word by word */
 		for (w = 0; w < 4; w++) {
 			if (filt->ff_flow6.fi6_dst.s6_addr32[w] !=
 			    (pkt->fi6_dst.s6_addr32[w] &
 			     filt->ff_mask6.mask6_dst.s6_addr32[w]))
 				return (0);
 		}
 	}
 	if (fbmask & FIMB6_TCLASS) {
 		if (filt->ff_flow6.fi6_tclass !=
 		    (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass))
 			return (0);
 	}
 	if (fbmask & FIMB6_GPI) {
 		if (filt->ff_flow6.fi6_gpi != pkt->fi6_gpi)
 			return (0);
 	}
 
 	/* nothing disagreed */
 	return (1);
 }
 #endif /* INET6 */
 
 /*
  *  filter handle:
  *	bit 20-28: index to the filter hash table
  *	bit  0-19: unique id in the hash bucket.
  */
 static u_long
 get_filt_handle(classifier, i)
 	struct acc_classifier *classifier;
 	int	i;
 {
 	static u_long handle_number = 1;
 	u_long 	handle;
 	struct acc_filter *afp;
 
 	while (1) {
 		handle = handle_number++ & 0x000fffff;
 
 		if (LIST_EMPTY(&classifier->acc_filters[i]))
 			break;
 
 		LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
 			if ((afp->f_handle & 0x000fffff) == handle)
 				break;
 		if (afp == NULL)
 			break;
 		/* this handle is already used, try again */
 	}
 
 	return ((i << 20) | handle);
 }
 
 /* convert filter handle to filter pointer */
 static struct acc_filter *
 filth_to_filtp(classifier, handle)
 	struct acc_classifier *classifier;
 	u_long handle;
 {
 	struct acc_filter *afp;
 	int	i;
 
 	i = ACC_GET_HINDEX(handle);
 
 	LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
 		if (afp->f_handle == handle)
 			return (afp);
 
 	return (NULL);
 }
 
 /*
  * Build the flowinfo bitmask for a filter: one FIMB4_* (IPv4) or FIMB6_*
  * (IPv6) bit is set for every flow field of the filter that is non-zero.
  * A filter of any other address family yields 0.
  */
 static u_int32_t
 filt2fibmask(filt)
 	struct flow_filter *filt;
 {
 	u_int32_t bits = 0;
 
 	if (filt->ff_flow.fi_family == AF_INET) {
 		if (filt->ff_flow.fi_proto != 0)
 			bits |= FIMB4_PROTO;
 		if (filt->ff_flow.fi_tos != 0)
 			bits |= FIMB4_TOS;
 		if (filt->ff_flow.fi_dst.s_addr != 0)
 			bits |= FIMB4_DADDR;
 		if (filt->ff_flow.fi_src.s_addr != 0)
 			bits |= FIMB4_SADDR;
 		if (filt->ff_flow.fi_sport != 0)
 			bits |= FIMB4_SPORT;
 		if (filt->ff_flow.fi_dport != 0)
 			bits |= FIMB4_DPORT;
 		if (filt->ff_flow.fi_gpi != 0)
 			bits |= FIMB4_GPI;
 	}
 #ifdef INET6
 	else if (filt->ff_flow.fi_family == AF_INET6) {
 		/* same storage, viewed through the IPv6 filter layout */
 		struct flow_filter6 *filt6 = (struct flow_filter6 *)filt;
 
 		if (filt6->ff_flow6.fi6_proto != 0)
 			bits |= FIMB6_PROTO;
 		if (filt6->ff_flow6.fi6_tclass != 0)
 			bits |= FIMB6_TCLASS;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst))
 			bits |= FIMB6_DADDR;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src))
 			bits |= FIMB6_SADDR;
 		if (filt6->ff_flow6.fi6_sport != 0)
 			bits |= FIMB6_SPORT;
 		if (filt6->ff_flow6.fi6_dport != 0)
 			bits |= FIMB6_DPORT;
 		if (filt6->ff_flow6.fi6_gpi != 0)
 			bits |= FIMB6_GPI;
 		if (filt6->ff_flow6.fi6_flowlabel != 0)
 			bits |= FIMB6_FLABEL;
 	}
 #endif /* INET6 */
 
 	return (bits);
 }
 
 /*
  * helper functions to handle IPv4 fragments.
  * currently only in-sequence fragments are handled.
  *	- fragment info is cached in a LRU list.
  *	- when a first fragment is found, cache its flow info.
  *	- when a non-first fragment is found, lookup the cache.
  */
 
 /* one entry of the IPv4 fragment cache kept on ip4f_list */
 struct ip4_frag {
     TAILQ_ENTRY(ip4_frag) ip4f_chain;	/* linkage on ip4f_list */
     char    ip4f_valid;		/* set by ip4f_alloc(), cleared by ip4f_free() */
     u_short ip4f_id;			/* ip_id of the cached datagram */
     struct flowinfo_in ip4f_info;	/* cached proto, addresses, ports and gpi */
 };
 
 static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */
 
 #define	IP4F_TABSIZE		16	/* IPv4 fragment cache size */
 
 /*
  * Remember the flow info of a fragmented IPv4 datagram: the id, protocol
  * and addresses are taken from the IP header, the ports and gpi from
  * `fin'.  The cache entries are allocated on first use; if that
  * allocation fails nothing is cached.
  */
 static void
 ip4f_cache(ip, fin)
 	struct ip *ip;
 	struct flowinfo_in *fin;
 {
 	struct ip4_frag *ent;
 
 	/* an empty list means the cache has not been set up yet */
 	if (TAILQ_EMPTY(&ip4f_list) && ip4f_init() < 0)
 		return;		/* allocation failed! */
 
 	ent = ip4f_alloc();
 
 	/* key: taken from the IP header */
 	ent->ip4f_id = ip->ip_id;
 	ent->ip4f_info.fi_proto = ip->ip_p;
 	ent->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
 	ent->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;
 
 	/* value: ports and gpi as supplied by the caller */
 	ent->ip4f_info.fi_sport = fin->fi_sport;
 	ent->ip4f_info.fi_dport = fin->fi_dport;
 	ent->ip4f_info.fi_gpi   = fin->fi_gpi;
 }
 
 /*
  * Look up the cached flow info for the datagram that `ip' belongs to
  * (same id, addresses and protocol).  On a hit the cached ports and gpi
  * are copied into `fin', the entry is released if this fragment does not
  * have IP_MF set, and 1 is returned.  Returns 0 when nothing matches.
  * The scan stops at the first entry that is not marked valid.
  */
 static int
 ip4f_lookup(ip, fin)
 	struct ip *ip;
 	struct flowinfo_in *fin;
 {
 	struct ip4_frag *ent;
 
 	ent = TAILQ_FIRST(&ip4f_list);
 	while (ent != NULL && ent->ip4f_valid) {
 		if (ip->ip_id != ent->ip4f_id ||
 		    ip->ip_src.s_addr != ent->ip4f_info.fi_src.s_addr ||
 		    ip->ip_dst.s_addr != ent->ip4f_info.fi_dst.s_addr ||
 		    ip->ip_p != ent->ip4f_info.fi_proto) {
 			/* some other datagram, keep scanning */
 			ent = TAILQ_NEXT(ent, ip4f_chain);
 			continue;
 		}
 
 		/* found the matching entry */
 		fin->fi_sport = ent->ip4f_info.fi_sport;
 		fin->fi_dport = ent->ip4f_info.fi_dport;
 		fin->fi_gpi   = ent->ip4f_info.fi_gpi;
 
 		/* this is the last fragment, release the entry. */
 		if ((ntohs(ip->ip_off) & IP_MF) == 0) {
 			ip4f_free(ent);
 		}
 
 		return (1);
 	}
 
 	/* no matching entry found */
 	return (0);
 }
 
 /*
  * Set up the fragment cache: initialize ip4f_list and fill it with up to
  * IP4F_TABSIZE entries, all marked invalid.  Allocation is M_NOWAIT; when
  * it fails the cache keeps the entries obtained so far.
  * Returns -1 only when not even the first entry could be allocated,
  * 0 otherwise.
  */
 static int
 ip4f_init(void)
 {
 	struct ip4_frag *ent;
 	int n;
 
 	TAILQ_INIT(&ip4f_list);
 	for (n = 0; n < IP4F_TABSIZE; n++) {
 		ent = malloc(sizeof(struct ip4_frag), M_DEVBUF, M_NOWAIT);
 		if (ent != NULL) {
 			ent->ip4f_valid = 0;
 			TAILQ_INSERT_TAIL(&ip4f_list, ent, ip4f_chain);
 			continue;
 		}
 
 		/* out of memory: a partially filled cache is still usable */
 		printf("ip4f_init: can't alloc %dth entry!\n", n);
 		return (n == 0 ? -1 : 0);
 	}
 	return (0);
 }
 
 /*
  * Hand out a cache entry: the entry at the tail of ip4f_list is marked
  * valid and moved to the head of the list.  The list is expected to be
  * non-empty (ip4f_cache() checks this before calling).
  */
 static struct ip4_frag *
 ip4f_alloc(void)
 {
 	struct ip4_frag *recycled;
 
 	recycled = TAILQ_LAST(&ip4f_list, ip4f_list);
 	recycled->ip4f_valid = 1;
 
 	/* tail -> head */
 	TAILQ_REMOVE(&ip4f_list, recycled, ip4f_chain);
 	TAILQ_INSERT_HEAD(&ip4f_list, recycled, ip4f_chain);
 
 	return (recycled);
 }
 
 /*
  * Release a cache entry: it is marked invalid and moved to the tail of
  * ip4f_list, where ip4f_alloc() picks up entries for reuse.
  */
 static void
 ip4f_free(fp)
 	struct ip4_frag *fp;
 {
 	fp->ip4f_valid = 0;
 
 	/* current position -> tail */
 	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
 	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
 }
 
 #endif /* ALTQ3_CLFIER_COMPAT */
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 1885f1fd3733..ab733f1d68ec 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -1,3170 +1,3171 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  * Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_ddb.h"
 #include "opt_netgraph.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/ttycom.h>
 #include <sys/uio.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 
 #include <sys/socket.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 #include <net/if_dl.h>
 #include <net/bpf.h>
 #include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
 #include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <net80211/ieee80211_freebsd.h>
 
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 /*
  * A bpf_if_ext whose descriptor list is statically initialized as empty.
  * NOTE(review): presumably the placeholder that detached interfaces are
  * pointed at -- confirm against bpfdetach().
  */
 static struct bpf_if_ext dead_bpf_if = {
 	.bif_dlist = CK_LIST_HEAD_INITIALIZER()
 };
 
 /*
  * BPF interface structure.  The public part (struct bpf_if_ext) is
  * embedded as the first member; the bif_next and bif_dlist macros let
  * code name its fields as if they were direct members of struct bpf_if.
  */
 struct bpf_if {
 #define	bif_next	bif_ext.bif_next
 #define	bif_dlist	bif_ext.bif_dlist
 	struct bpf_if_ext bif_ext;	/* public members */
 	u_int		bif_dlt;	/* link layer type */
 	u_int		bif_hdrlen;	/* length of link header */
 	struct bpfd_list bif_wlist;	/* writer-only list */
 	struct ifnet	*bif_ifp;	/* corresponding interface */
 	struct bpf_if	**bif_bpf;	/* Pointer to pointer to us */
 	volatile u_int	bif_refcnt;	/* see bpfif_ref()/bpfif_rele() */
 	struct epoch_context epoch_ctx;	/* for the deferred bpfif_free() */
 };
 
 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 
 /*
  * Header placed in front of a variable-sized payload; allocated by
  * bpf_program_buffer_alloc() and released by bpf_program_buffer_free().
  */
 struct bpf_program_buffer {
 	struct epoch_context	epoch_ctx;	/* for the deferred free */
 #ifdef BPF_JITTER
 	bpf_jit_filter		*func;	/* destroyed on free when non-NULL */
 #endif
 	void			*buffer[0];	/* start of the payload */
 };
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
 #define PRINET  26			/* interruptible */
 #define BPF_PRIO_MAX	7
 
 #define	SIZEOF_BPF_HDR(type)	\
     (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #define BPF_ALIGNMENT32 sizeof(int32_t)
 #define	BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)
 
 #ifndef BURN_BRIDGES
 /*
  * 32-bit version of structure prepended to each packet.  We use this header
  * instead of the standard one for 32-bit streams.  We mark a stream as
  * 32-bit the first time we see a 32-bit compat ioctl request.
  */
 struct bpf_hdr32 {
 	struct timeval32 bh_tstamp;	/* time stamp */
 	uint32_t	bh_caplen;	/* length of captured portion */
 	uint32_t	bh_datalen;	/* original length of packet */
 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
 					   plus alignment padding) */
 };
 #endif
 
 /* argument layout of the BIOCSETF32, BIOCSETWF32 and BIOCSETFNR32 ioctls */
 struct bpf_program32 {
 	u_int bf_len;		/* presumably the instruction count -- TODO confirm */
 	uint32_t bf_insns;	/* presumably a 32-bit user pointer -- TODO confirm */
 };
 
 /* argument layout of the BIOCGDLTLIST32 ioctl */
 struct bpf_dltlist32 {
 	u_int	bfl_len;	/* presumably the number of list slots -- TODO confirm */
 	u_int	bfl_list;	/* presumably a 32-bit user pointer -- TODO confirm */
 };
 
 #define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
 #define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
 #define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
 #define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
 #define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
 #define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
 #endif
 
 #define BPF_LOCK()	   sx_xlock(&bpf_sx)
 #define BPF_UNLOCK()		sx_xunlock(&bpf_sx)
 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
  * specific DLT. The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
 CK_LIST_HEAD(bpf_iflist, bpf_if);
 static struct bpf_iflist bpf_iflist;
 static struct sx	bpf_sx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
 static void	bpfif_ref(struct bpf_if *);
 static void	bpfif_rele(struct bpf_if *);
 
 static void	bpfd_ref(struct bpf_d *);
 static void	bpfd_rele(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_detachd_locked(struct bpf_d *, bool);
 static void	bpfd_free(epoch_context_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_d *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct bintime *);
 static void	reset_d(struct bpf_d *);
 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int	bpf_setdlt(struct bpf_d *, u_int);
 static void	filt_bpfdetach(struct knote *);
 static int	filt_bpfread(struct knote *, long);
 static int	filt_bpfwrite(struct knote *, long);
 static void	bpf_drvinit(void *);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "bpf sysctl");
 int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
 static int bpf_zerocopy_enable = 0;
 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
 VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN,
     &VNET_NAME(bpf_optimize_writers), 0,
     "Do not send packets until BPF program is set");
 
 static	d_open_t	bpfopen;
 static	d_read_t	bpfread;
 static	d_write_t	bpfwrite;
 static	d_ioctl_t	bpfioctl;
 static	d_poll_t	bpfpoll;
 static	d_kqfilter_t	bpfkqfilter;
 
 /* character device switch for "bpf": binds the entry points declared above */
 static struct cdevsw bpf_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	bpfopen,
 	.d_read =	bpfread,
 	.d_write =	bpfwrite,
 	.d_ioctl =	bpfioctl,
 	.d_poll =	bpfpoll,
 	.d_name =	"bpf",
 	.d_kqfilter =	bpfkqfilter,
 };
 
 /* filter operations whose event routine is filt_bpfread() */
 static struct filterops bpfread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_bpfdetach,
 	.f_event = filt_bpfread,
 };
 
 /* filter operations whose event routine is filt_bpfwrite() */
 static struct filterops bpfwrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_bpfdetach,
 	.f_event = filt_bpfwrite,
 };
 
 /*
  * LOCKING MODEL USED BY BPF
  *
  * Locks:
  * 1) global lock (BPF_LOCK). Sx, used to protect some global counters,
  * every bpf_iflist changes, serializes ioctl access to bpf descriptors.
  * 2) Descriptor lock. Mutex, used to protect BPF buffers and various
  * structure fields used by bpf_*tap* code.
  *
  * Lock order: global lock, then descriptor lock.
  *
  * There are several possible consumers:
  *
  * 1. The kernel registers interface pointer with bpfattach().
  * Each call allocates new bpf_if structure, references ifnet pointer
  * and links bpf_if into bpf_iflist chain. This is protected with global
  * lock.
  *
  * 2. A userland application issues ioctl() calls on a bpf_d descriptor.
  * All such calls are serialized with the global lock. BPF filters can be
  * changed, but the pointer to the old filter is freed using
  * NET_EPOCH_CALL(). Thus it should be safe for bpf_tap/bpf_mtap* code to
  * access filter pointers, even if a change happens during bpf_tap
  * execution. Destruction of a bpf_d descriptor is also done using
  * NET_EPOCH_CALL().
  *
  * 3. A userland application can write packets into a bpf_d descriptor.
  * There we need to be sure that the ifnet won't disappear during bpfwrite().
  *
  * 4. The kernel invokes bpf_tap/bpf_mtap* functions. The access to
  * bif_dlist is protected with net_epoch_preempt section. So, it should
  * be safe to make access to bpf_d descriptor inside the section.
  *
  * 5. The kernel invokes bpfdetach() when an interface is destroyed. All
  * lists are modified with the global lock held and the actual free() is
  * done using NET_EPOCH_CALL().
  */
 
 /*
  * Epoch callback: recover the bpf_if that embeds `ctx', drop its
  * reference on the interface and free the structure.
  */
 static void
 bpfif_free(epoch_context_t ctx)
 {
 	struct bpf_if *bif = __containerof(ctx, struct bpf_if, epoch_ctx);
 
 	if_rele(bif->bif_ifp);
 	free(bif, M_BPF);
 }
 
 /* Take a reference on a bpf_if (counted in bif_refcnt). */
 static void
 bpfif_ref(struct bpf_if *bp)
 {
 
 	refcount_acquire(&bp->bif_refcnt);
 }
 
 /*
  * Drop a reference on a bpf_if.  When refcount_release() reports true,
  * bpfif_free() is scheduled through NET_EPOCH_CALL() instead of being
  * called directly.
  */
 static void
 bpfif_rele(struct bpf_if *bp)
 {
 
 	if (refcount_release(&bp->bif_refcnt)) {
 		NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx);
 	}
 }
 
 /* Take a reference on a bpf descriptor (counted in bd_refcnt). */
 static void
 bpfd_ref(struct bpf_d *d)
 {
 
 	refcount_acquire(&d->bd_refcnt);
 }
 
 /*
  * Drop a reference on a bpf descriptor.  When refcount_release() reports
  * true, bpfd_free() is scheduled through NET_EPOCH_CALL() instead of
  * being called directly.
  */
 static void
 bpfd_rele(struct bpf_d *d)
 {
 
 	if (refcount_release(&d->bd_refcnt)) {
 		NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx);
 	}
 }
 
 /*
  * Allocate a bpf_program_buffer header followed by `size' bytes of
  * payload from M_BPF.  `flags' is passed to malloc() unchanged, so
  * whether the result can be NULL depends on the flags the caller chose.
  */
 static struct bpf_program_buffer*
 bpf_program_buffer_alloc(size_t size, int flags)
 {
 	size_t total;
 
 	total = sizeof(struct bpf_program_buffer) + size;
 	return (malloc(total, M_BPF, flags));
 }
 
 /*
  * Epoch callback: recover the bpf_program_buffer that embeds `ctx',
  * destroy its JIT filter when there is one (BPF_JITTER kernels only) and
  * free the buffer.
  */
 static void
 bpf_program_buffer_free(epoch_context_t ctx)
 {
 	struct bpf_program_buffer *pb =
 	    __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
 
 #ifdef BPF_JITTER
 	if (pb->func != NULL) {
 		bpf_destroy_jit_filter(pb->func);
 	}
 #endif
 	free(pb, M_BPF);
 }
 
 /*
  * Wrapper functions for various buffering methods.  If the set of buffer
  * modes expands, we will probably want to introduce a switch data structure
  * similar to protosw, etc.
  */
 /*
  * Append `len' bytes from `src' to `buf' at `offset' using the routine
  * that belongs to the descriptor's buffer mode.  In zero-copy mode the
  * descriptor's bd_zcopy counter is bumped first.  An unknown buffer mode
  * panics.  The descriptor lock must be held (asserted).
  */
 static void
 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/*
 	 * The mode routines are called as plain statements: ISO C does not
 	 * allow "return expr;" in a function returning void.
 	 */
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		bpf_buffer_append_bytes(d, buf, offset, src, len);
 		return;
 
 	case BPF_BUFMODE_ZBUF:
 		counter_u64_add(d->bd_zcopy, 1);
 		bpf_zerocopy_append_bytes(d, buf, offset, src, len);
 		return;
 
 	default:
 		panic("bpf_buf_append_bytes");
 	}
 }
 
 /*
  * Append `len' bytes of mbuf data from `src' to `buf' at `offset' using
  * the routine that belongs to the descriptor's buffer mode.  In zero-copy
  * mode the descriptor's bd_zcopy counter is bumped first.  An unknown
  * buffer mode panics.  The descriptor lock must be held (asserted).
  */
 static void
 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/*
 	 * The mode routines are called as plain statements: ISO C does not
 	 * allow "return expr;" in a function returning void.
 	 */
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		bpf_buffer_append_mbuf(d, buf, offset, src, len);
 		return;
 
 	case BPF_BUFMODE_ZBUF:
 		counter_u64_add(d->bd_zcopy, 1);
 		bpf_zerocopy_append_mbuf(d, buf, offset, src, len);
 		return;
 
 	default:
 		panic("bpf_buf_append_mbuf");
 	}
 }
 
 /*
  * This function gets called when the free buffer is re-assigned.
  */
 static void
 bpf_buf_reclaimed(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		/* nothing to do for plain buffers */
 		break;
 
 	case BPF_BUFMODE_ZBUF:
 		/* let the zero-copy code know about the reassignment */
 		bpf_zerocopy_buf_reclaimed(d);
 		break;
 
 	default:
 		panic("bpf_buf_reclaimed");
 	}
 }
 
 /*
  * If the buffer mechanism has a way to decide that a held buffer can be made
  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
  * returned if the buffer can be discarded, (0) is returned if it cannot.
  */
 static int
 bpf_canfreebuf(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* only the zero-copy mode has an answer; every other mode says no */
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		return (bpf_zerocopy_canfreebuf(d));
 	return (0);
 }
 
 /*
  * Allow the buffer model to indicate that the current store buffer is
  * immutable, regardless of the appearance of space.  Return (1) if the
  * buffer is writable, and (0) if not.
  */
 static int
 bpf_canwritebuf(struct bpf_d *d)
 {
 	BPFD_LOCK_ASSERT(d);
 
 	/* only the zero-copy mode can veto; every other mode is writable */
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		return (bpf_zerocopy_canwritebuf(d));
 	return (1);
 }
 
 /*
  * Notify buffer model that an attempt to write to the store buffer has
  * resulted in a dropped packet, in which case the buffer may be considered
  * full.
  */
 static void
 bpf_buffull(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* the notification is only forwarded in zero-copy mode */
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		bpf_zerocopy_buffull(d);
 }
 
 /*
  * Notify the buffer model that a buffer has moved into the hold position.
  */
 void
 bpf_bufheld(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* the notification is only forwarded in zero-copy mode */
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		bpf_zerocopy_bufheld(d);
 }
 
 /*
  * Release the descriptor's buffers with the routine that belongs to its
  * buffer mode.  An unknown buffer mode panics.
  */
 static void
 bpf_free(struct bpf_d *d)
 {
 
 	/*
 	 * The mode routines are called as plain statements: ISO C does not
 	 * allow "return expr;" in a function returning void.
 	 */
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		bpf_buffer_free(d);
 		return;
 
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_free(d);
 		return;
 
 	default:
 		panic("bpf_buf_free");
 	}
 }
 
 /*
  * Forward to bpf_buffer_uiomove() when the descriptor is in
  * BPF_BUFMODE_BUFFER mode; any other mode yields EOPNOTSUPP.
  */
 static int
 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
 {
 
 	if (d->bd_bufmode == BPF_BUFMODE_BUFFER)
 		return (bpf_buffer_uiomove(d, buf, len, uio));
 	return (EOPNOTSUPP);
 }
 
 /*
  * Forward to bpf_buffer_ioctl_sblen() when the descriptor is in
  * BPF_BUFMODE_BUFFER mode; any other mode yields EOPNOTSUPP.
  */
 static int
 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
 {
 
 	if (d->bd_bufmode == BPF_BUFMODE_BUFFER)
 		return (bpf_buffer_ioctl_sblen(d, i));
 	return (EOPNOTSUPP);
 }
 
 /*
  * Forward to bpf_zerocopy_ioctl_getzmax() when the descriptor is in
  * BPF_BUFMODE_ZBUF mode; any other mode yields EOPNOTSUPP.
  */
 static int
 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 {
 
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		return (bpf_zerocopy_ioctl_getzmax(td, d, i));
 	return (EOPNOTSUPP);
 }
 
 /*
  * Forward to bpf_zerocopy_ioctl_rotzbuf() when the descriptor is in
  * BPF_BUFMODE_ZBUF mode; any other mode yields EOPNOTSUPP.
  */
 static int
 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
 	return (EOPNOTSUPP);
 }
 
 /*
  * Forward to bpf_zerocopy_ioctl_setzbuf() when the descriptor is in
  * BPF_BUFMODE_ZBUF mode; any other mode yields EOPNOTSUPP.
  */
 static int
 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode == BPF_BUFMODE_ZBUF)
 		return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
 	return (EOPNOTSUPP);
 }
 
 /*
  * General BPF functions.
  */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
 {
 	const struct ieee80211_bpf_params *p;
 	struct ether_header *eh;
 	struct mbuf *m;
 	int error;
 	int len;
 	int hlen;
 	int slen;
 
 	/*
 	 * Build a sockaddr based on the data link layer type.
 	 * We do this at this level because the ethernet header
 	 * is copied directly into the data field of the sockaddr.
 	 * In the case of SLIP, there is no header and the packet
 	 * is forwarded as is.
 	 * Also, we are careful to leave room at the front of the mbuf
 	 * for the link level header.
 	 */
 	switch (linktype) {
 	case DLT_SLIP:
 		sockp->sa_family = AF_INET;
 		hlen = 0;
 		break;
 
 	case DLT_EN10MB:
 		sockp->sa_family = AF_UNSPEC;
 		/* XXX Would MAXLINKHDR be better? */
 		hlen = ETHER_HDR_LEN;
 		break;
 
 	case DLT_FDDI:
 		sockp->sa_family = AF_IMPLINK;
 		hlen = 0;
 		break;
 
 	case DLT_RAW:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 0;
 		break;
 
 	case DLT_NULL:
 		/*
 		 * null interface types require a 4 byte pseudo header which
 		 * corresponds to the address family of the packet.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;
 		break;
 
 	case DLT_ATM_RFC1483:
 		/*
 		 * en atm driver requires 4-byte atm pseudo header.
 		 * though it isn't standard, vpi:vci needs to be
 		 * specified anyway.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 		break;
 
 	case DLT_PPP:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;	/* This should match PPP_HDRLEN */
 		break;
 
 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
 		sockp->sa_family = AF_IEEE80211;
 		hlen = 0;
 		break;
 
 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
 		sockp->sa_family = AF_IEEE80211;
 		sockp->sa_len = 12;	/* XXX != 0 */
 		hlen = sizeof(struct ieee80211_bpf_params);
 		break;
 
 	default:
 		return (EIO);
 	}
 
 	len = uio->uio_resid;
 	if (len < hlen || len - hlen > ifp->if_mtu)
 		return (EMSGSIZE);
 
 	/* Allocate a mbuf for our write, since m_get2 fails if len >= to MJUMPAGESIZE, use m_getjcl for bigger buffers */
 	m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (EIO);
 	m->m_pkthdr.len = m->m_len = len;
 	*mp = m;
 
 	error = uiomove(mtod(m, u_char *), len, uio);
 	if (error)
 		goto bad;
 
 	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
 	if (slen == 0) {
 		error = EPERM;
 		goto bad;
 	}
 
 	/* Check for multicast destination */
 	switch (linktype) {
 	case DLT_EN10MB:
 		eh = mtod(m, struct ether_header *);
 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
 			    ETHER_ADDR_LEN) == 0)
 				m->m_flags |= M_BCAST;
 			else
 				m->m_flags |= M_MCAST;
 		}
 		if (d->bd_hdrcmplt == 0) {
 			memcpy(eh->ether_shost, IF_LLADDR(ifp),
 			    sizeof(eh->ether_shost));
 		}
 		break;
 	}
 
 	/*
 	 * Make room for link header, and copy it to sockaddr
 	 */
 	if (hlen != 0) {
 		if (sockp->sa_family == AF_IEEE80211) {
 			/*
 			 * Collect true length from the parameter header
 			 * NB: sockp is known to be zero'd so if we do a
 			 *     short copy unspecified parameters will be
 			 *     zero.
 			 * NB: packet may not be aligned after stripping
 			 *     bpf params
 			 * XXX check ibp_vers
 			 */
 			p = mtod(m, const struct ieee80211_bpf_params *);
 			hlen = p->ibp_len;
 			if (hlen > sizeof(sockp->sa_data)) {
 				error = EINVAL;
 				goto bad;
 			}
 		}
 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
 	}
 	*hdrlen = hlen;
 
 	return (0);
 bad:
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Attach descriptor d to the bpf interface bp, i.e. make d listen on bp.
  * A descriptor that is already attached somewhere is detached first.  The
  * descriptor's buffers and counters are then reset with reset_d().
  *
  * Must be called with the global BPF lock held.
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
 	int writer_only;
 
 	BPF_LOCK_ASSERT();
 
 	/*
 	 * Sample the sysctl exactly once so that a concurrent change cannot
 	 * make the list selection and the bpf_track notification below
 	 * disagree with each other.
 	 */
 	writer_only = V_bpf_optimize_writers || d->bd_writer;
 
 	if (d->bd_bif != NULL)
 		bpf_detachd_locked(d, false);
 
 	BPFD_LOCK(d);
 	/* The descriptor holds a reference on bp while it uses it. */
 	bpfif_ref(bp);
 	d->bd_bif = bp;
 	if (writer_only == 0) {
 		/* Regular listener: goes straight onto the active list. */
 		CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 	} else {
 		/*
 		 * Many applications use BPF only for sending raw packets
 		 * (dhcpd, cdpd are good examples), so adding d to the list
 		 * of active listeners is delayed until some filter is
 		 * configured; until then it sits on the writers-only list.
 		 */
 		CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 		/*
 		 * bd_writer is decremented on every filter set operation.
 		 * The first BIOCSETF is done by pcap_open_live() to set up
 		 * the snap length; after that the application usually sets
 		 * its own filter.
 		 */
 		d->bd_writer = 2;
 	}
 
 	reset_d(d);
 
 	/* Trigger EVFILT_WRITE events. */
 	bpf_wakeup(d);
 
 	BPFD_UNLOCK(d);
 	bpf_bpfd_cnt++;
 
 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
 
 	if (writer_only == 0)
 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 }
 
 /*
  * Decide whether descriptor @d must be upgraded from write-only mode now
  * that the filter @fcode (of @flen instructions) is being installed via
  * ioctl @cmd.  Returns 1 when an upgrade is needed and 0 otherwise.  As a
  * side effect, bd_writer is decremented when the filter looks like a
  * snaplen-only program.
  */
 static int
 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
     int flen)
 {
 	int is_snap, need_upgrade;
 
 	/* Already upgraded, or the new filter is empty: nothing to decide. */
 	if (d->bd_writer == 0 || fcode == NULL)
 		return (0);
 
 	/*
 	 * Does cmd look like the snaplen setting issued by
 	 * pcap_bpf.c:pcap_open_live(), i.e. BIOCSETF with a single
 	 * "return constant" instruction?  The .k value is deliberately not
 	 * examined: pcap_open_live() always uses a non-zero value, but the
 	 * k=0 (deny ALL) case is treated the same way, e.g. it does not
 	 * cause an immediate upgrade either.
 	 */
 	is_snap = (cmd == BIOCSETF && flen == 1 &&
 	    fcode[0].code == (BPF_RET | BPF_K)) ? 1 : 0;
 
 	if (is_snap != 0) {
 		/*
 		 * The first snaplen-style BIOCSETF (as issued by
 		 * pcap_open_live()) does not require an upgrade.  Once the
 		 * counter reaches zero a snaplen filter had already been
 		 * seen, so this one is probably a catch-all filter.
 		 */
 		d->bd_writer--;
 		need_upgrade = (d->bd_writer == 0) ? 1 : 0;
 	} else {
 		/*
 		 * A filter that does not look like a snaplen setting means
 		 * bpf is probably being used directly: upgrade immediately.
 		 */
 		need_upgrade = 1;
 	}
 
 	CTR5(KTR_NET,
 	    "%s: filter function set by pid %d, "
 	    "bd_writer counter %d, snap %d upgrade %d",
 	    __func__, d->bd_pid, d->bd_writer,
 	    is_snap, need_upgrade);
 
 	return (need_upgrade);
 }
 
 /*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
 	BPF_LOCK();
 	bpf_detachd_locked(d, false);
 	BPF_UNLOCK();
 }
 
 /*
  * Detach descriptor d from the interface it is attached to, if any.
  * detached_ifp tells whether the underlying ifnet is itself going away: in
  * that case waiters on the descriptor are woken up and promiscuous mode is
  * left alone.  Must be called with the global BPF lock held.
  */
 static void
 bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 {
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 	int error, writer;
 
 	BPF_LOCK_ASSERT();
 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 
 	/* Nothing to do for a descriptor that is not attached. */
 	bp = d->bd_bif;
 	if (bp == NULL)
 		return;
 
 	BPFD_LOCK(d);
 	/* Unlink d from whichever interface list it is on. */
 	CK_LIST_REMOVE(d, bd_next);
 	/* Remember bd_writer for the notification after the unlock. */
 	writer = d->bd_writer;
 	ifp = bp->bif_ifp;
 	d->bd_bif = NULL;
 	if (detached_ifp) {
 		/*
 		 * Notify descriptor as it's detached, so that any
 		 * sleepers wake up and get ENXIO.
 		 */
 		bpf_wakeup(d);
 	}
 	BPFD_UNLOCK(d);
 	bpf_bpfd_cnt--;
 
 	/* Only descriptors with bd_writer == 0 are reported to bpf_track. */
 	if (writer == 0)
 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 
 	/*
 	 * If this descriptor had requested promiscuous mode and the ifnet
 	 * is not being detached, turn promiscuous mode off again.
 	 */
 	if (d->bd_promisc && !detached_ifp) {
 		d->bd_promisc = 0;
 		CURVNET_SET(ifp->if_vnet);
 		error = ifpromisc(ifp, 0);
 		CURVNET_RESTORE();
 		/*
 		 * ENXIO can happen if a pccard is unplugged.  Any other
 		 * failure means something is really wrong: we were able to
 		 * put the driver into promiscuous mode, but can't take it
 		 * out.
 		 */
 		if (error != 0 && error != ENXIO) {
 			if_printf(bp->bif_ifp,
 				"bpf_detach: ifpromisc failed (%d)\n", error);
 		}
 	}
 	/* Drop the interface reference held on behalf of the descriptor. */
 	bpfif_rele(bp);
 }
 
 /*
  * Close the descriptor: stop its read timeout, drop signal ownership,
  * detach it from its interface, tear down the select/kqueue state and
  * release the reference on the descriptor.
  *
  * bpfopen() registers this function as the destructor of the cdev private
  * data; "data" is the struct bpf_d allocated there.
  */
 static void
 bpf_dtor(void *data)
 {
 	struct bpf_d *d = data;
 
 	/* Cancel a read timeout that is still armed and go idle. */
 	BPFD_LOCK(d);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 	funsetown(&d->bd_sigio);
 	/* bpf_detachd() takes the global BPF lock itself. */
 	bpf_detachd(d);
 #ifdef MAC
 	mac_bpfdesc_destroy(d);
 #endif /* MAC */
 	seldrain(&d->bd_sel);
 	knlist_destroy(&d->bd_sel.si_note);
 	callout_drain(&d->bd_callout);
 	/* Drop a reference on the descriptor. */
 	bpfd_rele(d);
 }
 
 /*
  * Open a bpf device.  Allocates a fresh descriptor, registers it as the
  * cdev private data with bpf_dtor() as its destructor, and initializes it.
  * Returns 0 on success or the error from devfs_set_cdevpriv().
  */
 /* ARGSUSED */
 static	int
 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct bpf_d *d;
 	int error;
 
 	/* Zeroed allocation: fields not set below start out as 0/NULL. */
 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	error = devfs_set_cdevpriv(d, bpf_dtor);
 	if (error != 0) {
 		free(d, M_BPF);
 		return (error);
 	}
 
 	/* Setup counters */
 	d->bd_rcount = counter_u64_alloc(M_WAITOK);
 	d->bd_dcount = counter_u64_alloc(M_WAITOK);
 	d->bd_fcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wfcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wdcount = counter_u64_alloc(M_WAITOK);
 	d->bd_zcopy = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * For historical reasons, perform a one-time initialization call to
 	 * the buffer routines, even though we're not yet committed to a
 	 * particular buffer method.
 	 */
 	bpf_buffer_init(d);
 	/*
 	 * A descriptor not opened for reading starts out with bd_writer
 	 * set; see bpf_check_upgrade() for how the counter is consumed.
 	 */
 	if ((flags & FREAD) == 0)
 		d->bd_writer = 2;
 	d->bd_hbuf_in_use = 0;
 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	/* Initial reference; bpf_dtor() calls bpfd_rele() on the descriptor. */
 	refcount_init(&d->bd_refcnt, 1);
 	BPF_PID_REFRESH(d, td);
 #ifdef MAC
 	mac_bpfdesc_init(d);
 	mac_bpfdesc_create(td->td_ucred, d);
 #endif
 	/* The callout and the knote list both use the descriptor mutex. */
 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
 
 	/* Disable VLAN pcp tagging. */
 	d->bd_pcp = 0;
 
 	return (0);
 }
 
 /*
  *  bpfread - read next chunk of packets from buffers
  *
  *  Only supported in BPF_BUFMODE_BUFFER mode (EOPNOTSUPP otherwise), and
  *  the caller's buffer must be exactly bd_bufsize bytes (EINVAL otherwise).
  *  Returns ENXIO when no data is available and the descriptor is not
  *  attached to an interface, EWOULDBLOCK for a non-blocking read with
  *  nothing to return, EINTR/ERESTART when a sleep is interrupted, 0 with
  *  no data when the read timeout expires on an empty store buffer, and
  *  otherwise the result of copying the hold buffer out to userland.
  */
 static	int
 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d;
 	int error;
 	int non_block;
 	int timed_out;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Restrict application to use a buffer the same size as
 	 * the kernel buffers.
 	 */
 	if (uio->uio_resid != d->bd_bufsize)
 		return (EINVAL);
 
 	non_block = ((ioflag & O_NONBLOCK) != 0);
 
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
 		BPFD_UNLOCK(d);
 		return (EOPNOTSUPP);
 	}
 	/*
 	 * Cancel an armed read timeout, but remember whether it had already
 	 * fired before the state is forced back to idle.
 	 */
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
 	d->bd_state = BPF_IDLE;
 	/* Wait until no other thread is using the hold buffer. */
 	while (d->bd_hbuf_in_use) {
 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 		    PRINET|PCATCH, "bd_hbuf", 0);
 		if (error != 0) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 	}
 	/*
 	 * If the hold buffer is empty, then do a timed sleep, which
 	 * ends when the timeout expires or when enough packets
 	 * have arrived to fill the store buffer.
 	 */
 	while (d->bd_hbuf == NULL) {
 		if (d->bd_slen != 0) {
 			/*
 			 * A packet(s) either arrived since the previous
 			 * read or arrived while we were asleep.
 			 */
 			if (d->bd_immediate || non_block || timed_out) {
 				/*
 				 * Rotate the buffers and return what's here
 				 * if we are in immediate mode, non-blocking
 				 * flag is set, or this descriptor timed out.
 				 */
 				ROTATE_BUFFERS(d);
 				break;
 			}
 		}
 
 		/*
 		 * No data is available, check to see if the bpf device
 		 * is still pointed at a real interface.  If not, return
 		 * ENXIO so that the userland process knows to rebind
 		 * it before using it again.
 		 */
 		if (d->bd_bif == NULL) {
 			BPFD_UNLOCK(d);
 			return (ENXIO);
 		}
 
 		if (non_block) {
 			BPFD_UNLOCK(d);
 			return (EWOULDBLOCK);
 		}
 		/* Sleep on the descriptor, which is what bpf_wakeup() wakes. */
 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
 		     "bpf", d->bd_rtout);
 		if (error == EINTR || error == ERESTART) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 		if (error == EWOULDBLOCK) {
 			/*
 			 * On a timeout, return what's in the buffer,
 			 * which may be nothing.  If there is something
 			 * in the store buffer, we can rotate the buffers.
 			 */
 			if (d->bd_hbuf)
 				/*
 				 * We filled up the buffer in between
 				 * getting the timeout and arriving
 				 * here, so we don't need to rotate.
 				 */
 				break;
 
 			if (d->bd_slen == 0) {
 				BPFD_UNLOCK(d);
 				return (0);
 			}
 			ROTATE_BUFFERS(d);
 			break;
 		}
 	}
 	/*
 	 * At this point, we know we have something in the hold slot.
 	 * Mark the hold buffer busy so it stays put while the descriptor
 	 * lock is dropped for the copy to userland.
 	 */
 	d->bd_hbuf_in_use = 1;
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bd_bufsize bytes.
 	 *
 	 * We do not have to worry about simultaneous reads because
 	 * we waited for sole access to the hold buffer above.
 	 */
 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	/*
 	 * Hand the consumed hold buffer back as the free buffer and wake
 	 * up anyone waiting for bd_hbuf_in_use to clear.
 	 */
 	BPFD_LOCK(d);
 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
 	bpf_buf_reclaimed(d);
 	d->bd_hbuf_in_use = 0;
 	wakeup(&d->bd_hbuf_in_use);
 	BPFD_UNLOCK(d);
 
 	return (error);
 }
 
 /*
  * Wake up everything waiting on descriptor d: threads sleeping on it, the
  * SIGIO-style signal owner (when async mode, a signal number and an owner
  * are all configured), and select/poll/kqueue waiters.  A pending read
  * timeout is cancelled on the way.  The descriptor lock must be held.
  */
 static __inline void
 bpf_wakeup(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* An armed read timeout is no longer needed. */
 	if (d->bd_state == BPF_WAITING) {
 		callout_stop(&d->bd_callout);
 		d->bd_state = BPF_IDLE;
 	}
 
 	wakeup(d);
 
 	if (d->bd_async) {
 		if (d->bd_sig && d->bd_sigio) {
 			pgsigio(&d->bd_sigio, d->bd_sig, 0);
 		}
 	}
 
 	selwakeuppri(&d->bd_sel, PRINET);
 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
 }
 
 /*
  * Callout handler for the descriptor's read timeout.  arg is the struct
  * bpf_d the callout belongs to.  A descriptor that is still in the
  * BPF_WAITING state is moved to BPF_TIMED_OUT, and waiters are woken up
  * when the store buffer holds data.
  */
 static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *d = arg;
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* Ignore the call if the callout is pending again or not active. */
 	if (callout_pending(&d->bd_callout))
 		return;
 	if (!callout_active(&d->bd_callout))
 		return;
 
 	if (d->bd_state != BPF_WAITING)
 		return;
 
 	d->bd_state = BPF_TIMED_OUT;
 	if (d->bd_slen != 0)
 		bpf_wakeup(d);
 }
 
 /*
  * Report whether descriptor d has data ready: returns 1 when the hold
  * buffer has data and cannot be freed, or when the store buffer has data
  * and the descriptor is in immediate mode or has timed out; 0 otherwise.
  * The descriptor lock must be held.
  */
 static int
 bpf_ready(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* Hold buffer first. */
 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
 		return (1);
 
 	/* Otherwise only a non-empty store buffer can make d ready. */
 	if (d->bd_slen == 0)
 		return (0);
 	if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		return (1);
 	return (0);
 }
 
 /*
  * bpfwrite - send a packet written by userland out of the attached
  * interface.
  *
  * Returns ENXIO when no interface is attached (or it went away while the
  * packet was being copied in), ENETDOWN when the interface is not up, any
  * error from bpf_movein(), or the result of the interface's if_output
  * routine.  A zero-length write returns 0 without sending anything.
  *
  * bd_wcount is bumped for every call, bd_wfcount once a packet is about to
  * be handed to the interface, and bd_wdcount on every path that does not
  * end in a successful if_output call (including the zero-length case).
  */
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct route ro;
 	struct sockaddr dst;
 	struct epoch_tracker et;
 	struct bpf_if *bp;
 	struct bpf_d *d;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
 	int error, hlen;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	counter_u64_add(d->bd_wcount, 1);
 	if ((bp = d->bd_bif) == NULL) {
 		error = ENXIO;
 		goto out_locked;
 	}
 
 	ifp = bp->bif_ifp;
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		error = ENETDOWN;
 		goto out_locked;
 	}
 
 	/* Nothing to send; error is still 0 here. */
 	if (uio->uio_resid == 0)
 		goto out_locked;
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 
 	/*
 	 * Take extra reference, unlock d and exit from epoch section,
 	 * since bpf_movein() can sleep.
 	 */
 	bpfd_ref(d);
 	NET_EPOCH_EXIT(et);
 	BPFD_UNLOCK(d);
 
 	error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
 	    &m, &dst, &hlen, d);
 
 	if (error != 0) {
 		counter_u64_add(d->bd_wdcount, 1);
 		bpfd_rele(d);
 		return (error);
 	}
 
 	BPFD_LOCK(d);
 	/*
 	 * Check that descriptor is still attached to the interface.
 	 * This can happen on bpfdetach(). To avoid access to detached
 	 * ifnet, free mbuf and return ENXIO.
 	 */
 	if (d->bd_bif == NULL) {
 		counter_u64_add(d->bd_wdcount, 1);
 		BPFD_UNLOCK(d);
 		bpfd_rele(d);
 		m_freem(m);
 		return (ENXIO);
 	}
 	counter_u64_add(d->bd_wfcount, 1);
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
 	/*
 	 * In feedback mode a copy of the packet is kept so it can be passed
 	 * to the interface's input routine after a successful send.  A
 	 * failed m_dup() just means no copy is fed back.
 	 */
 	if (d->bd_feedback) {
 		mc = m_dup(m, M_NOWAIT);
 		if (mc != NULL)
 			mc->m_pkthdr.rcvif = ifp;
 		/* Set M_PROMISC for outgoing packets to be discarded. */
 		if (d->bd_direction == BPF_D_INOUT)
 			m->m_flags |= M_PROMISC;
 	} else
 		mc = NULL;
 
 	/*
 	 * Strip the link-level header from the mbuf; bpf_movein() already
 	 * copied it into dst.sa_data.
 	 */
 	m->m_pkthdr.len -= hlen;
 	m->m_len -= hlen;
 	m->m_data += hlen;	/* XXX */
 
 	CURVNET_SET(ifp->if_vnet);
 #ifdef MAC
 	mac_bpfdesc_create_mbuf(d, m);
 	if (mc != NULL)
 		mac_bpfdesc_create_mbuf(d, mc);
 #endif
 
 	/* Pass the saved link-level header along in the route. */
 	bzero(&ro, sizeof(ro));
 	if (hlen != 0) {
 		ro.ro_prepend = (u_char *)&dst.sa_data;
 		ro.ro_plen = hlen;
 		ro.ro_flags = RT_HAS_HEADER;
 	}
 
 	if (d->bd_pcp != 0)
 		vlan_set_pcp(m, d->bd_pcp);
 
 	/* Avoid possible recursion on BPFD_LOCK(). */
 	NET_EPOCH_ENTER(et);
 	BPFD_UNLOCK(d);
 	error = (*ifp->if_output)(ifp, m, &dst, &ro);
 	if (error)
 		counter_u64_add(d->bd_wdcount, 1);
 
 	/* Feed the copy back only if the original went out. */
 	if (mc != NULL) {
 		if (error == 0)
 			(*ifp->if_input)(ifp, mc);
 		else
 			m_freem(mc);
 	}
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	/* Drop the reference taken before bpf_movein(). */
 	bpfd_rele(d);
 	return (error);
 
 	/*
 	 * Early exits taken while the descriptor lock and the epoch section
 	 * entered at the top are still held.
 	 */
 out_locked:
 	counter_u64_add(d->bd_wdcount, 1);
 	NET_EPOCH_EXIT(et);
 	BPFD_UNLOCK(d);
 	return (error);
 }
 
 /*
  * Reset a descriptor: flush its packet buffers and zero all of its
  * counters.  Flushing is straightforward for kernel-only buffers, but with
  * zero-copy buffers we can't write to (or rotate) buffers that are
  * currently owned by userspace, hence the bpf_canfreebuf() and
  * bpf_canwritebuf() checks.  It would be nice if this logic lived in the
  * buffer code rather than here.
  *
  * The descriptor lock must be held; it is dropped while sleeping for the
  * hold buffer to become unused.
  */
 static void
 reset_d(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	/* Wait for whoever is using the hold buffer to finish. */
 	while (d->bd_hbuf_in_use) {
 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
 		    "bd_hbuf", 0);
 	}
 
 	if (d->bd_hbuf != NULL) {
 		if (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d)) {
 			/* Turn the hold buffer into the free buffer. */
 			d->bd_fbuf = d->bd_hbuf;
 			d->bd_hbuf = NULL;
 			d->bd_hlen = 0;
 			bpf_buf_reclaimed(d);
 		}
 	}
 
 	/* Discard the store buffer contents when it may be written. */
 	if (bpf_canwritebuf(d))
 		d->bd_slen = 0;
 
 	/* Read-side statistics. */
 	counter_u64_zero(d->bd_rcount);
 	counter_u64_zero(d->bd_dcount);
 	counter_u64_zero(d->bd_fcount);
 	/* Write-side statistics. */
 	counter_u64_zero(d->bd_wcount);
 	counter_u64_zero(d->bd_wfcount);
 	counter_u64_zero(d->bd_wdcount);
 	/* Zero-copy statistics. */
 	counter_u64_zero(d->bd_zcopy);
 }
 
 /*
  *  FIONREAD		Check for read packet available.
  *  BIOCGBLEN		Get buffer len [for read()].
  *  BIOCSBLEN		Set buffer len.
  *  BIOCSETF		Set read filter.
  *  BIOCSETFNR		Set read filter without resetting descriptor.
  *  BIOCSETWF		Set write filter.
  *  BIOCFLUSH		Flush read packet buffer.
  *  BIOCPROMISC		Put interface into promiscuous mode.
  *  BIOCGDLT		Get link layer type.
  *  BIOCGDLTLIST	Get list of supported link layer types.
  *  BIOCSDLT		Set link layer type.
  *  BIOCGETIF		Get interface name.
  *  BIOCSETIF		Set interface.
  *  BIOCSRTIMEOUT	Set read timeout.
  *  BIOCGRTIMEOUT	Get read timeout.
  *  BIOCGSTATS		Get packet stats.
  *  BIOCIMMEDIATE	Set immediate mode.
  *  BIOCVERSION		Get filter language version.
  *  BIOCGHDRCMPLT	Get "header already complete" flag
  *  BIOCSHDRCMPLT	Set "header already complete" flag
  *  BIOCGDIRECTION	Get packet direction flag
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCGTSTAMP		Get time stamp format and resolution.
  *  BIOCSTSTAMP		Set time stamp format and resolution.
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
  *  BIOCSRSIG		Set receive signal.
  *  BIOCGRSIG		Get receive signal.
  *  BIOCSETZBUF		Set current zero-copy buffer locations.
  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
  *  BIOCROTZBUF		Force rotation of zero-copy buffer
  *  BIOCSETBUFMODE	Set buffer mode.
  *  BIOCGETBUFMODE	Get current buffer mode.
  *  BIOCSETVLANPCP	Set VLAN PCP tag.
  *
  * addr points at the in-kernel copy of the ioctl argument.  Returns 0 on
  * success, EPERM for a request that is not allowed on a locked descriptor,
  * EINVAL for unknown commands and invalid arguments, or the error reported
  * by the helper that implements the request.
  */
 /* ARGSUSED */
 static	int
 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct bpf_d *d;
 	int error;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Refresh PID associated with this descriptor.  Any ioctl also
 	 * cancels an armed read timeout and puts the descriptor back into
 	 * the idle state.
 	 */
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH(d, td);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Once BIOCLOCK has been issued, only the commands listed here are
 	 * still accepted; everything else fails with EPERM.
 	 */
 	if (d->bd_locked == 1) {
 		switch (cmd) {
 		case BIOCGBLEN:
 		case BIOCFLUSH:
 		case BIOCGDLT:
 		case BIOCGDLTLIST:
 #ifdef COMPAT_FREEBSD32
 		case BIOCGDLTLIST32:
 #endif
 		case BIOCGETIF:
 		case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 		case BIOCGRTIMEOUT32:
 #endif
 		case BIOCGSTATS:
 		case BIOCVERSION:
 		case BIOCGRSIG:
 		case BIOCGHDRCMPLT:
 		case BIOCSTSTAMP:
 		case BIOCFEEDBACK:
 		case FIONREAD:
 		case BIOCLOCK:
 		case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 		case BIOCSRTIMEOUT32:
 #endif
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
 		case BIOCROTZBUF:
 			break;
 		default:
 			return (EPERM);
 		}
 	}
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
 	 * that it will get 32-bit packet headers.
 	 */
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 	case BIOCGDLTLIST32:
 	case BIOCGRTIMEOUT32:
 	case BIOCSRTIMEOUT32:
 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 			BPFD_LOCK(d);
 			d->bd_compat32 = 1;
 			BPFD_UNLOCK(d);
 		}
 	}
 #endif
 
 	/*
 	 * Every exit from here on must be paired with CURVNET_RESTORE();
 	 * the early returns in BIOCSETBUFMODE do so explicitly.
 	 */
 	CURVNET_SET(TD_TO_VNET(td));
 	switch (cmd) {
 	default:
 		error = EINVAL;
 		break;
 
 	/*
 	 * Check for read packet available.
 	 */
 	case FIONREAD:
 		{
 			int n;
 
 			BPFD_LOCK(d);
 			n = d->bd_slen;
 			while (d->bd_hbuf_in_use)
 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 				    PRINET, "bd_hbuf", 0);
 			if (d->bd_hbuf)
 				n += d->bd_hlen;
 			BPFD_UNLOCK(d);
 
 			*(int *)addr = n;
 			break;
 		}
 
 	/*
 	 * Get buffer len [for read()].
 	 */
 	case BIOCGBLEN:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufsize;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
 		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
 	 * Set link layer read filter.
 	 */
 	case BIOCSETF:
 	case BIOCSETFNR:
 	case BIOCSETWF:
 #ifdef COMPAT_FREEBSD32
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 #endif
 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
 		break;
 
 	/*
 	 * Flush read packet buffer.
 	 */
 	case BIOCFLUSH:
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Put interface into promiscuous mode.
 	 */
 	case BIOCPROMISC:
 		BPF_LOCK();
 		if (d->bd_bif == NULL) {
 			/*
 			 * No interface attached yet.
 			 */
 			error = EINVAL;
 		} else if (d->bd_promisc == 0) {
 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get current data link type.
 	 */
 	case BIOCGDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			*(u_int *)addr = d->bd_bif->bif_dlt;
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get a list of supported data link types.
 	 */
 #ifdef COMPAT_FREEBSD32
 	case BIOCGDLTLIST32:
 		{
 			struct bpf_dltlist32 *list32;
 			struct bpf_dltlist dltlist;
 
 			list32 = (struct bpf_dltlist32 *)addr;
 			dltlist.bfl_len = list32->bfl_len;
 			dltlist.bfl_list = PTRIN(list32->bfl_list);
 			BPF_LOCK();
 			if (d->bd_bif == NULL)
 				error = EINVAL;
 			else {
 				error = bpf_getdltlist(d, &dltlist);
 				if (error == 0)
 					list32->bfl_len = dltlist.bfl_len;
 			}
 			BPF_UNLOCK();
 			break;
 		}
 #endif
 
 	case BIOCGDLTLIST:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set data link type.
 	 */
 	case BIOCSDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_setdlt(d, *(u_int *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get interface name.
 	 */
 	case BIOCGETIF:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else {
 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
 			struct ifreq *const ifr = (struct ifreq *)addr;
 
 			strlcpy(ifr->ifr_name, ifp->if_xname,
 			    sizeof(ifr->ifr_name));
 		}
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set interface.
 	 */
 	case BIOCSETIF:
 		{
 			int alloc_buf;
 			/*
 			 * u_int to match the pointer type that
 			 * bpf_buffer_ioctl_sblen() is handed elsewhere.
 			 */
 			u_int size;
 
 			/*
 			 * Behavior here depends on the buffering model.  If
 			 * we're using kernel memory buffers, then we can
 			 * allocate them here.  If we're using zero-copy,
 			 * then the user process must have registered buffers
 			 * by the time we get here.
 			 */
 			alloc_buf = 0;
 			BPFD_LOCK(d);
 			if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
 			    d->bd_sbuf == NULL)
 				alloc_buf = 1;
 			BPFD_UNLOCK(d);
 			if (alloc_buf) {
 				size = d->bd_bufsize;
 				error = bpf_buffer_ioctl_sblen(d, &size);
 				if (error != 0)
 					break;
 			}
 			BPF_LOCK();
 			error = bpf_setif(d, (struct ifreq *)addr);
 			BPF_UNLOCK();
 			break;
 		}
 
 	/*
 	 * Set read timeout.
 	 */
 	case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 	case BIOCSRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv = (struct timeval *)addr;
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCSRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv = &tv64;
 				tv->tv_sec = tv32->tv_sec;
 				tv->tv_usec = tv32->tv_usec;
 			} else
 #endif
 				tv = (struct timeval *)addr;
 
 			/*
 			 * Subtract 1 tick from tvtohz() since this isn't
 			 * a one-shot timer.
 			 */
 			if ((error = itimerfix(tv)) == 0)
 				d->bd_rtout = tvtohz(tv) - 1;
 			break;
 		}
 
 	/*
 	 * Get read timeout.
 	 */
 	case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 	case BIOCGRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv;
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCGRTIMEOUT32)
 				tv = &tv64;
 			else
 #endif
 				tv = (struct timeval *)addr;
 
 			tv->tv_sec = d->bd_rtout / hz;
 			tv->tv_usec = (d->bd_rtout % hz) * tick;
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 			if (cmd == BIOCGRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv32->tv_sec = tv->tv_sec;
 				tv32->tv_usec = tv->tv_usec;
 			}
 #endif
 
 			break;
 		}
 
 	/*
 	 * Get packet stats.
 	 */
 	case BIOCGSTATS:
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
 			/* XXXCSJP overflow */
 			bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
 			bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
 			break;
 		}
 
 	/*
 	 * Set immediate mode.
 	 */
 	case BIOCIMMEDIATE:
 		BPFD_LOCK(d);
 		d->bd_immediate = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCVERSION:
 		{
 			struct bpf_version *bv = (struct bpf_version *)addr;
 
 			bv->bv_major = BPF_MAJOR_VERSION;
 			bv->bv_minor = BPF_MINOR_VERSION;
 			break;
 		}
 
 	/*
 	 * Get "header already complete" flag
 	 */
 	case BIOCGHDRCMPLT:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_hdrcmplt;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set "header already complete" flag
 	 */
 	case BIOCSHDRCMPLT:
 		BPFD_LOCK(d);
 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Get packet direction flag
 	 */
 	case BIOCGDIRECTION:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_direction;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet direction flag
 	 */
 	case BIOCSDIRECTION:
 		{
 			u_int	direction;
 
 			direction = *(u_int *)addr;
 			switch (direction) {
 			case BPF_D_IN:
 			case BPF_D_INOUT:
 			case BPF_D_OUT:
 				BPFD_LOCK(d);
 				d->bd_direction = direction;
 				BPFD_UNLOCK(d);
 				break;
 			default:
 				error = EINVAL;
 			}
 		}
 		break;
 
 	/*
 	 * Get packet timestamp format and resolution.
 	 */
 	case BIOCGTSTAMP:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_tstamp;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet timestamp format and resolution.
 	 */
 	case BIOCSTSTAMP:
 		{
 			u_int	func;
 
 			func = *(u_int *)addr;
 			if (BPF_T_VALID(func))
 				d->bd_tstamp = func;
 			else
 				error = EINVAL;
 		}
 		break;
 
 	case BIOCFEEDBACK:
 		BPFD_LOCK(d);
 		d->bd_feedback = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCLOCK:
 		BPFD_LOCK(d);
 		d->bd_locked = 1;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIONBIO:		/* Non-blocking I/O */
 		break;
 
 	case FIOASYNC:		/* Send signal on receive packets */
 		BPFD_LOCK(d);
 		d->bd_async = *(int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIOSETOWN:
 		/*
 		 * XXX: Add some sort of locking here?
 		 * fsetown() can sleep.
 		 */
 		error = fsetown(*(int *)addr, &d->bd_sigio);
 		break;
 
 	case FIOGETOWN:
 		BPFD_LOCK(d);
 		*(int *)addr = fgetown(&d->bd_sigio);
 		BPFD_UNLOCK(d);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
 		break;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)addr = -fgetown(&d->bd_sigio);
 		break;
 
 	case BIOCSRSIG:		/* Set receive signal */
 		{
 			u_int sig;
 
 			sig = *(u_int *)addr;
 
 			if (sig >= NSIG)
 				error = EINVAL;
 			else {
 				BPFD_LOCK(d);
 				d->bd_sig = sig;
 				BPFD_UNLOCK(d);
 			}
 			break;
 		}
 	case BIOCGRSIG:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_sig;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETBUFMODE:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufmode;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCSETBUFMODE:
 		/*
 		 * Allow the buffering mode to be changed as long as we
 		 * haven't yet committed to a particular mode.  Our
 		 * definition of commitment, for now, is whether or not a
 		 * buffer has been allocated or an interface attached, since
 		 * that's the point where things get tricky.
 		 */
 		switch (*(u_int *)addr) {
 		case BPF_BUFMODE_BUFFER:
 			break;
 
 		case BPF_BUFMODE_ZBUF:
 			if (bpf_zerocopy_enable)
 				break;
 			/* FALLSTHROUGH */
 
 		default:
 			CURVNET_RESTORE();
 			return (EINVAL);
 		}
 
 		BPFD_LOCK(d);
 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
 			BPFD_UNLOCK(d);
 			CURVNET_RESTORE();
 			return (EBUSY);
 		}
 		d->bd_bufmode = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETZMAX:
 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
 		break;
 
 	case BIOCSETZBUF:
 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 
 	case BIOCROTZBUF:
 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 
 	case BIOCSETVLANPCP:
 		{
 			u_int pcp;
 
 			/*
 			 * pcp is unsigned, so the upper bound is the only
 			 * range check that can ever fail.
 			 */
 			pcp = *(u_int *)addr;
 			if (pcp > BPF_PRIO_MAX) {
 				error = EINVAL;
 				break;
 			}
 			d->bd_pcp = pcp;
 			break;
 		}
 	}
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Set d's packet filter program to fp. If this file already has a filter,
  * free it and replace it. Returns EINVAL for bogus requests.
  *
  * cmd selects which filter is replaced: BIOCSETWF installs the write
  * filter, every other command installs the read filter, and BIOCSETF
  * additionally resets the descriptor.  A program with bf_len == 0 removes
  * the filter.  Installing a read filter may upgrade a write-only
  * descriptor to a regular listener (see bpf_check_upgrade()).
  *
  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
  * calls.
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
 {
 #ifdef COMPAT_FREEBSD32
 	struct bpf_program fp_swab;
 	struct bpf_program32 *fp32;
 #endif
 	struct bpf_program_buffer *fcode;
 	struct bpf_insn *filter;
 #ifdef BPF_JITTER
 	bpf_jit_filter *jfunc;
 #endif
 	size_t size;
 	u_int flen;
 	bool track_event;
 
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * Convert a 32-bit program descriptor into the native layout and
 	 * map the compat command onto its native counterpart.  BIOCSETFNR32
 	 * is not remapped; the code below only compares cmd against
 	 * BIOCSETWF and BIOCSETF, so it takes the read-filter path without
 	 * a descriptor reset.
 	 */
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETWF32:
 	case BIOCSETFNR32:
 		fp32 = (struct bpf_program32 *)fp;
 		fp_swab.bf_len = fp32->bf_len;
 		fp_swab.bf_insns =
 		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
 		fp = &fp_swab;
 		switch (cmd) {
 		case BIOCSETF32:
 			cmd = BIOCSETF;
 			break;
 		case BIOCSETWF32:
 			cmd = BIOCSETWF;
 			break;
 		}
 		break;
 	}
 #endif
 
 	filter = NULL;
 #ifdef BPF_JITTER
 	jfunc = NULL;
 #endif
 	/*
 	 * Check new filter validness before acquiring any locks.
 	 * Allocate memory for new filter, if needed.
 	 */
 	flen = fp->bf_len;
 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
 		return (EINVAL);
 	size = flen * sizeof(*fp->bf_insns);
 	if (size > 0) {
 		/* We're setting up new filter. Copy and check actual data. */
 		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
 		filter = (struct bpf_insn *)fcode->buffer;
 		if (copyin(fp->bf_insns, filter, size) != 0 ||
 		    !bpf_validate(filter, flen)) {
 			free(fcode, M_BPF);
 			return (EINVAL);
 		}
 #ifdef BPF_JITTER
 		if (cmd != BIOCSETWF) {
 			/*
 			 * Filter is copied inside fcode and is
 			 * perfectly valid.
 			 */
 			jfunc = bpf_jitter(filter, flen);
 		}
 #endif
 	}
 
 	track_event = false;
 	/*
 	 * From here on fcode designates the OLD program buffer, if any,
 	 * which is to be released once the new filter is in place.
 	 */
 	fcode = NULL;
 
 	BPF_LOCK();
 	BPFD_LOCK(d);
 	/* Set up new filter. */
 	if (cmd == BIOCSETWF) {
 		if (d->bd_wfilter != NULL) {
 			fcode = __containerof((void *)d->bd_wfilter,
 			    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 			fcode->func = NULL;
 #endif
 		}
 		d->bd_wfilter = filter;
 	} else {
 		if (d->bd_rfilter != NULL) {
 			fcode = __containerof((void *)d->bd_rfilter,
 			    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 			/* Hand the old JIT function over with the old buffer. */
 			fcode->func = d->bd_bfilter;
 #endif
 		}
 		d->bd_rfilter = filter;
 #ifdef BPF_JITTER
 		d->bd_bfilter = jfunc;
 #endif
 		if (cmd == BIOCSETF)
 			reset_d(d);
 
 		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
 			/*
 			 * Filter can be set several times without
 			 * specifying interface. In this case just mark d
 			 * as reader.
 			 */
 			d->bd_writer = 0;
 			if (d->bd_bif != NULL) {
 				/*
 				 * Remove descriptor from writers-only list
 				 * and add it to active readers list.
 				 */
 				CK_LIST_REMOVE(d, bd_next);
 				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
 				    d, bd_next);
 				CTR2(KTR_NET,
 				    "%s: upgrade required by pid %d",
 				    __func__, d->bd_pid);
 				track_event = true;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 
 	/* The old program buffer is released through NET_EPOCH_CALL(). */
 	if (fcode != NULL)
 		NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);
 
 	/*
 	 * The bpf_track notification is sent after the descriptor lock has
 	 * been dropped, but still under the global BPF lock.
 	 */
 	if (track_event)
 		EVENTHANDLER_INVOKE(bpf_track,
 		    d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
 
 	BPF_UNLOCK();
 	return (0);
 }
 
 /*
  * Bind descriptor d to the interface whose name is stored in ifr.
  *
  * If the named interface is not the one d is currently bound to, the
  * descriptor is handed to bpf_attachd(); otherwise it is only reset.
  * Returns 0 on success, ENXIO when the interface does not exist or has no
  * bpf attachment, and EINVAL when the store buffer has not been allocated.
  * The global bpf lock must be held by the caller.
  */
 static int
 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
 	struct ifnet *ifp;
 	struct bpf_if *bif;
 
 	BPF_LOCK_ASSERT();
 
 	ifp = ifunit(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 	bif = ifp->if_bpf;
 	if (bif == NULL)
 		return (ENXIO);
 
 	/* Only the two known buffer modes are expected here. */
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER &&
 	    d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
 
 	/* The buffer must already have been allocated by now. */
 	if (d->bd_sbuf == NULL)
 		return (EINVAL);
 
 	if (bif == d->bd_bif) {
 		/* Same interface as before: just reset the descriptor. */
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 	} else
 		bpf_attachd(d, bif);
 
 	return (0);
 }
 
 /*
  * Support for select() and poll() system calls.
  *
  * Requested write events are always reported as ready.  Requested read
  * events are reported when bpf_ready() says so; otherwise the thread is
  * recorded with selrecord() and the read timeout is started if one is
  * configured and the descriptor is idle.  A descriptor without private
  * data or without an interface reports every requested event (including
  * POLLHUP).
  */
 static int
 bpfpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct bpf_d *d;
 	int rdevents, revents;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) {
 		revents = events &
 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM);
 		return (revents);
 	}
 
 	rdevents = events & (POLLIN | POLLRDNORM);
 	revents = events & (POLLOUT | POLLWRNORM);
 
 	BPFD_LOCK(d);
 	/* Refresh PID associated with this descriptor. */
 	BPF_PID_REFRESH(d, td);
 	if (rdevents != 0) {
 		if (bpf_ready(d)) {
 			revents |= rdevents;
 		} else {
 			selrecord(td, &d->bd_sel);
 			/* Start the read timeout if necessary. */
 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 				callout_reset(&d->bd_callout, d->bd_rtout,
 				    bpf_timed_out, d);
 				d->bd_state = BPF_WAITING;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 
 	return (revents);
 }
 
 /*
  * Support for kevent() system call.  EVFILT_READ and EVFILT_WRITE knotes
  * are accepted and hooked onto the descriptor's knote list; every other
  * filter type is rejected.  Returns 0 on success and 1 on rejection or
  * when the descriptor's private data cannot be obtained.
  */
 int
 bpfkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct bpf_d *d;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0)
 		return (1);
 
 	/* Pick the filter operations matching the requested filter. */
 	if (kn->kn_filter == EVFILT_READ)
 		kn->kn_fop = &bpfread_filtops;
 	else if (kn->kn_filter == EVFILT_WRITE)
 		kn->kn_fop = &bpfwrite_filtops;
 	else
 		return (1);
 
 	BPFD_LOCK(d);
 	/* Refresh PID associated with this descriptor. */
 	BPF_PID_REFRESH_CUR(d);
 	kn->kn_hook = d;
 	knlist_add(&d->bd_sel.si_note, kn, 1);
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 /*
  * Knote detach hook: unlink kn from the knote list of the descriptor
  * stored in its kn_hook.
  */
 static void
 filt_bpfdetach(struct knote *kn)
 {
 	struct bpf_d *d;
 
 	d = (struct bpf_d *)kn->kn_hook;
 	knlist_remove(&d->bd_sel.si_note, kn, 0);
 }
 
 /*
  * EVFILT_READ event hook.  Returns the result of bpf_ready().  When the
  * descriptor is ready, kn_data is set to the number of readable bytes;
  * otherwise the read timeout is started if one is configured and the
  * descriptor is idle.  The descriptor lock must be held.
  */
 static int
 filt_bpfread(struct knote *kn, long hint)
 {
 	struct bpf_d *d;
 	int ready;
 
 	d = (struct bpf_d *)kn->kn_hook;
 	BPFD_LOCK_ASSERT(d);
 
 	ready = bpf_ready(d);
 	if (!ready) {
 		if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 			callout_reset(&d->bd_callout, d->bd_rtout,
 			    bpf_timed_out, d);
 			d->bd_state = BPF_WAITING;
 		}
 		return (ready);
 	}
 
 	kn->kn_data = d->bd_slen;
 	/*
 	 * Count the hold buffer as well, unless it is currently being
 	 * copied to user space.
 	 */
 	if (d->bd_hbuf && !d->bd_hbuf_in_use)
 		kn->kn_data += d->bd_hlen;
 
 	return (ready);
 }
 
 /*
  * EVFILT_WRITE event hook.  A descriptor bound to an interface is always
  * writable and reports the interface MTU in kn_data; an unbound descriptor
  * is not writable and reports 0.  The descriptor lock must be held.
  */
 static int
 filt_bpfwrite(struct knote *kn, long hint)
 {
 	struct bpf_d *d;
 
 	d = (struct bpf_d *)kn->kn_hook;
 	BPFD_LOCK_ASSERT(d);
 
 	if (d->bd_bif != NULL) {
 		kn->kn_data = d->bd_bif->bif_ifp->if_mtu;
 		return (1);
 	}
 
 	kn->kn_data = 0;
 	return (0);
 }
 
 /*
  * Timestamp quality levels returned by bpf_ts_quality() and bpf_gettime().
  * The values are ordered: the tap routines compare them with '<' to decide
  * whether a timestamp already obtained is good enough for a descriptor.
  */
 #define	BPF_TSTAMP_NONE		0
 #define	BPF_TSTAMP_FAST		1
 #define	BPF_TSTAMP_NORMAL	2
 #define	BPF_TSTAMP_EXTERN	3
 
 /*
  * Map a descriptor timestamp type onto one of the BPF_TSTAMP_* quality
  * levels: NONE when timestamps are disabled, FAST when the BPF_T_FAST bit
  * is set, NORMAL otherwise.
  */
 static int
 bpf_ts_quality(int tstype)
 {
 
 	if (tstype == BPF_T_NONE)
 		return (BPF_TSTAMP_NONE);
 
 	return ((tstype & BPF_T_FAST) != 0 ? BPF_TSTAMP_FAST :
 	    BPF_TSTAMP_NORMAL);
 }
 
 static int
 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
 {
 	struct timespec ts;
 	struct m_tag *tag;
 	int quality;
 
 	quality = bpf_ts_quality(tstype);
 	if (quality == BPF_TSTAMP_NONE)
 		return (quality);
 
 	if (m != NULL) {
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts);
 			timespec2bintime(&ts, bt);
 			return (BPF_TSTAMP_EXTERN);
 		}
 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
 		if (tag != NULL) {
 			*bt = *(struct bintime *)(tag + 1);
 			return (BPF_TSTAMP_EXTERN);
 		}
 	}
 	if (quality == BPF_TSTAMP_NORMAL)
 		binuptime(bt);
 	else
 		getbinuptime(bt);
 
 	return (quality);
 }
 
 /*
  * Incoming linkage from device drivers.  Process the packet pkt, of length
  * pktlen, which is stored in a contiguous buffer.  The packet is run
  * through the read filter of every descriptor on bp's reader list, and if
  * accepted, stashed into the corresponding buffer.
  *
  * The list is walked inside a network epoch section; a descriptor's lock
  * is taken only after its filter has matched.
  */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int slen;
 	int gottime;
 
 	/* Quality of the timestamp currently held in bt; none fetched yet. */
 	gottime = BPF_TSTAMP_NONE;
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		counter_u64_add(d->bd_rcount, 1);
 		/*
 		 * NB: We don't call BPF_CHECK_DIRECTION() here since there
 		 * is no way for the caller to indicate to us whether this
 		 * packet is inbound or outbound. In the bpf_mtap() routines,
 		 * we use the interface pointers on the mbuf to figure it out.
 		 */
 		/*
 		 * With BPF_JITTER the bpf_filter() call below is the else
 		 * branch of the JIT check; without it, it runs always.
 		 */
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		if (bf != NULL)
 			slen = (*(bf->func))(pkt, pktlen, pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
 		if (slen != 0) {
 			/*
 			 * Filter matches.  Acquire the descriptor lock.
 			 */
 			BPFD_LOCK(d);
 			counter_u64_add(d->bd_fcount, 1);
 			/*
 			 * Fetch a timestamp only if the one already held is
 			 * of lower quality than this descriptor asks for.
 			 */
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp,
 				    NULL);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, pkt, pktlen, slen,
 				    bpf_append_bytes, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * True when descriptor d must skip a packet: d captures only inbound
  * traffic (BPF_D_IN) and r differs from i, or d captures only outbound
  * traffic (BPF_D_OUT) and r equals i.  The tap routines pass the mbuf's
  * receive interface as r and the tapped interface as i.
  */
 #define	BPF_CHECK_DIRECTION(d, r, i)				\
 	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
 	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  * Locking model is explained in bpf_tap().
  *
  * Every descriptor on bp's reader list whose direction setting allows the
  * packet has its read filter run against m; matching packets are copied
  * into the descriptor's buffer by catchpacket().
  */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
 	if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	/* Quality of the timestamp currently held in bt; none fetched yet. */
 	gottime = BPF_TSTAMP_NONE;
 
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
 		/*
 		 * With BPF_JITTER the bpf_filter() call below is the else
 		 * branch of the JIT check; without it, it runs always.
 		 */
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		/* XXX We cannot handle multiple mbufs. */
 		if (bf != NULL && m->m_next == NULL)
 			slen = (*(bf->func))(mtod(m, u_char *), pktlen,
 			    pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
 		if (slen != 0) {
 			/* Filter matched: lock the descriptor and store. */
 			BPFD_LOCK(d);
 
 			counter_u64_add(d->bd_fcount, 1);
 			/*
 			 * Fetch a timestamp only if the one already held is
 			 * of lower quality than this descriptor asks for.
 			 */
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Incoming linkage from device drivers, when packet is in
  * an mbuf chain and to be prepended by a contiguous header.
  *
  * data/dlen describe the header; it is presented to the filter and to
  * catchpacket() as the first element of a temporary on-stack mbuf chain
  * whose next element is m.
  */
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct mbuf mb;
 	struct bpf_d *d;
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	/*
 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
 	 * Note that we cut corners here; we only setup what's
 	 * absolutely needed--this mbuf should never go anywhere else.
 	 */
 	mb.m_flags = 0;
 	mb.m_next = m;
 	mb.m_data = data;
 	mb.m_len = dlen;
 	/* The total packet length includes the prepended header. */
 	pktlen += dlen;
 
 	/* Quality of the timestamp currently held in bt; none fetched yet. */
 	gottime = BPF_TSTAMP_NONE;
 
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
 		if (slen != 0) {
 			/* Filter matched: lock the descriptor and store. */
 			BPFD_LOCK(d);
 
 			counter_u64_add(d->bd_fcount, 1);
 			/*
 			 * The timestamp lookup uses the original mbuf m,
 			 * not the on-stack header mbuf.
 			 */
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 #undef	BPF_CHECK_DIRECTION
 #undef	BPF_TSTAMP_NONE
 #undef	BPF_TSTAMP_FAST
 #undef	BPF_TSTAMP_NORMAL
 #undef	BPF_TSTAMP_EXTERN
 
 /*
  * Return the length of the bpf header that precedes each captured packet
  * on descriptor d, including alignment padding.
  *
  * The padding is computed on the sum of the link-level header length and
  * the bpf header size, and the link-level header length is subtracted
  * again before returning.  Which header structure is used depends on the
  * descriptor's timestamp format and, with COMPAT_FREEBSD32, on whether the
  * descriptor is in 32-bit compatibility mode.
  */
 static int
 bpf_hdrlen(struct bpf_d *d)
 {
 	int hdrlen;
 
 	hdrlen = d->bd_bif->bif_hdrlen;
 #ifndef BURN_BRIDGES
 	/* No timestamp or microtime format: use the old-style header. */
 	if (d->bd_tstamp == BPF_T_NONE ||
 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32)
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
 		else
 #endif
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
 	else
 #endif
 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		hdrlen = BPF_WORDALIGN32(hdrlen);
 	else
 #endif
 		hdrlen = BPF_WORDALIGN(hdrlen);
 
 	/* Report only the bpf header part, padding included. */
 	return (hdrlen - d->bd_bif->bif_hdrlen);
 }
 
 static void
 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
 {
 	struct bintime bt2, boottimebin;
 	struct timeval tsm;
 	struct timespec tsn;
 
 	if ((tstype & BPF_T_MONOTONIC) == 0) {
 		bt2 = *bt;
 		getboottimebin(&boottimebin);
 		bintime_add(&bt2, &boottimebin);
 		bt = &bt2;
 	}
 	switch (BPF_T_FORMAT(tstype)) {
 	case BPF_T_MICROTIME:
 		bintime2timeval(bt, &tsm);
 		ts->bt_sec = tsm.tv_sec;
 		ts->bt_frac = tsm.tv_usec;
 		break;
 	case BPF_T_NANOTIME:
 		bintime2timespec(bt, &tsn);
 		ts->bt_sec = tsn.tv_sec;
 		ts->bt_frac = tsn.tv_nsec;
 		break;
 	case BPF_T_BINTIME:
 		ts->bt_sec = bt->sec;
 		ts->bt_frac = bt->frac;
 		break;
 	}
 }
 
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
  * transfer.  bpf_append_bytes is passed in to copy contiguous chunks, while
  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  *
  * pktlen is the full packet length and snaplen the number of bytes the
  * filter asked to keep; bt is the timestamp to record when the descriptor
  * has timestamps enabled.  The descriptor lock must be held.  The packet
  * is counted as dropped when the descriptor has no interface or when no
  * buffer space can be made available.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
     struct bintime *bt)
 {
 	static char zeroes[BPF_ALIGNMENT];
 	struct bpf_xhdr hdr;
 #ifndef BURN_BRIDGES
 	struct bpf_hdr hdr_old;
 #ifdef COMPAT_FREEBSD32
 	struct bpf_hdr32 hdr32_old;
 #endif
 #endif
 	int caplen, curlen, hdrlen, pad, totlen;
 	int do_wakeup = 0;
 	int do_timestamp;
 	int tstype;
 
 	BPFD_LOCK_ASSERT(d);
 	if (d->bd_bif == NULL) {
 		/* Descriptor was detached in concurrent thread */
 		counter_u64_add(d->bd_dcount, 1);
 		return;
 	}
 
 	/*
 	 * Detect whether user space has released a buffer back to us, and if
 	 * so, move it from being a hold buffer to a free buffer.  This may
 	 * not be the best place to do it (for example, we might only want to
 	 * run this check if we need the space), but for now it's a reliable
 	 * spot to do it.
 	 */
 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 		d->bd_hlen = 0;
 		bpf_buf_reclaimed(d);
 	}
 
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
 	 * much.  Otherwise, transfer the whole packet (unless
 	 * we hit the buffer size limit).
 	 */
 	hdrlen = bpf_hdrlen(d);
 	totlen = hdrlen + min(snaplen, pktlen);
 	if (totlen > d->bd_bufsize)
 		totlen = d->bd_bufsize;
 
 	/*
 	 * Round up the end of the previous packet to the next longword.
 	 *
 	 * Drop the packet if there's no room and no hope of room.
 	 * If the packet would overflow the storage buffer or the storage
 	 * buffer is considered immutable by the buffer model, try to rotate
 	 * the buffer and wakeup pending processes.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		curlen = BPF_WORDALIGN32(d->bd_slen);
 	else
 #endif
 		curlen = BPF_WORDALIGN(d->bd_slen);
 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
 		if (d->bd_fbuf == NULL) {
 			/*
 			 * There's no room in the store buffer, and no
 			 * prospect of room, so drop the packet.  Notify the
 			 * buffer model.
 			 */
 			bpf_buffull(d);
 			counter_u64_add(d->bd_dcount, 1);
 			return;
 		}
 		KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
 		ROTATE_BUFFERS(d);
 		do_wakeup = 1;
 		/* The packet goes at the start of the new store buffer. */
 		curlen = 0;
 	} else {
 		if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
 			/*
 			 * Immediate mode is set, or the read timeout has
 			 * already expired during a select call.  A packet
 			 * arrived, so the reader should be woken up.
 			 */
 			do_wakeup = 1;
 		}
 		/* Number of alignment bytes after the previous packet. */
 		pad = curlen - d->bd_slen;
 		KASSERT(pad >= 0 && pad <= sizeof(zeroes),
 		    ("%s: invalid pad byte count %d", __func__, pad));
 		if (pad > 0) {
 			/* Zero pad bytes. */
 			bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes,
 			    pad);
 		}
 	}
 
 	/* Captured payload length, after any clamping to the buffer size. */
 	caplen = totlen - hdrlen;
 	tstype = d->bd_tstamp;
 	do_timestamp = tstype != BPF_T_NONE;
 #ifndef BURN_BRIDGES
 	/*
 	 * No timestamp or microtime format: emit the old-style header (or
 	 * its 32-bit variant) and jump straight to the payload copy.
 	 */
 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
 		struct bpf_ts ts;
 		if (do_timestamp)
 			bpf_bintime2ts(bt, &ts, tstype);
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32) {
 			bzero(&hdr32_old, sizeof(hdr32_old));
 			if (do_timestamp) {
 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
 			}
 			hdr32_old.bh_datalen = pktlen;
 			hdr32_old.bh_hdrlen = hdrlen;
 			hdr32_old.bh_caplen = caplen;
 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
 			    sizeof(hdr32_old));
 			goto copy;
 		}
 #endif
 		bzero(&hdr_old, sizeof(hdr_old));
 		if (do_timestamp) {
 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
 		}
 		hdr_old.bh_datalen = pktlen;
 		hdr_old.bh_hdrlen = hdrlen;
 		hdr_old.bh_caplen = caplen;
 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
 		    sizeof(hdr_old));
 		goto copy;
 	}
 #endif
 
 	/*
 	 * Append the bpf header.  Note we append the actual header size, but
 	 * move forward the length of the header plus padding.
 	 */
 	bzero(&hdr, sizeof(hdr));
 	if (do_timestamp)
 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
 	hdr.bh_datalen = pktlen;
 	hdr.bh_hdrlen = hdrlen;
 	hdr.bh_caplen = caplen;
 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
 
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
 #ifndef BURN_BRIDGES
 copy:
 #endif
 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
  * Final destruction of a descriptor: release its buffers, both filter
  * programs, its lock, its counters and the descriptor itself.
  *
  * ctx is the epoch context embedded in the descriptor (presumably this
  * runs as a deferred epoch callback once the descriptor is closed --
  * confirm against the caller).
  */
 static void
 bpfd_free(epoch_context_t ctx)
 {
 	struct bpf_program_buffer *prog;
 	struct bpf_d *d;
 
 	/*
 	 * No locking against interrupts is needed: the descriptor has
 	 * already been detached from its interface and has not yet been
 	 * marked free.
 	 */
 	d = __containerof(ctx, struct bpf_d, epoch_ctx);
 	bpf_free(d);
 
 	/* Read filter, together with its JIT-compiled form if any. */
 	if (d->bd_rfilter != NULL) {
 		prog = __containerof((void *)d->bd_rfilter,
 		    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 		prog->func = d->bd_bfilter;
 #endif
 		bpf_program_buffer_free(&prog->epoch_ctx);
 	}
 
 	/* Write filter; it never has a JIT-compiled form. */
 	if (d->bd_wfilter != NULL) {
 		prog = __containerof((void *)d->bd_wfilter,
 		    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 		prog->func = NULL;
 #endif
 		bpf_program_buffer_free(&prog->epoch_ctx);
 	}
 
 	mtx_destroy(&d->bd_lock);
 
 	/* Per-descriptor statistics counters. */
 	counter_u64_free(d->bd_rcount);
 	counter_u64_free(d->bd_dcount);
 	counter_u64_free(d->bd_fcount);
 	counter_u64_free(d->bd_wcount);
 	counter_u64_free(d->bd_wfcount);
 	counter_u64_free(d->bd_wdcount);
 	counter_u64_free(d->bd_zcopy);
 
 	free(d, M_BPF);
 }
 
 /*
  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
  * fixed size of the link header (variable length headers not yet supported).
  *
  * Convenience wrapper around bpfattach2() that stores the resulting
  * bpf_if pointer in the interface's own if_bpf field.
  */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /*
  * Attach an interface to bpf.  ifp is a pointer to the structure
  * defining the interface to be attached, dlt is the link layer type,
  * and hdrlen is the fixed size of the link header (variable length
  * headers are not yet supported).
  *
  * driverp is where the pointer to the newly allocated bpf_if is stored;
  * it must be NULL on entry.
  */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
     struct bpf_if **driverp)
 {
 	struct bpf_if *bp;
 
 	KASSERT(*driverp == NULL,
 	    ("bpfattach2: driverp already initialized"));
 
 	bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
 
 	/* Fully initialize the bpf_if before making it visible. */
 	CK_LIST_INIT(&bp->bif_dlist);
 	CK_LIST_INIT(&bp->bif_wlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 	bp->bif_hdrlen = hdrlen;
 	bp->bif_bpf = driverp;
 	refcount_init(&bp->bif_refcnt, 1);
 	*driverp = bp;
 	/*
 	 * Reference ifnet pointer, so it won't be freed until
 	 * we release it.
 	 */
 	if_ref(ifp);
 	/* Add the new bpf_if to the global interface list. */
 	BPF_LOCK();
 	CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
 	BPF_UNLOCK();
 
 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
 		if_printf(ifp, "bpf attached\n");
 }
 
 #ifdef VIMAGE
 /*
  * Report the link layer type and link header length recorded for bp.
  *
  * When an interface is moved between vnet instances these values have to
  * be read before the detach so that if_bpf can be re-attached after the
  * vmove; there is no device driver infrastructure to query the interface
  * for them after creation/attach, hence this workaround.
  *
  * Either output pointer may be NULL, in which case that value is simply
  * not reported.  Returns ENXIO when bp is NULL and 0 otherwise.
  */
 int
 bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
 {
 
 	if (bp == NULL)
 		return (ENXIO);
 
 	if (bif_dlt != NULL)
 		*bif_dlt = bp->bif_dlt;
 	if (bif_hdrlen != NULL)
 		*bif_hdrlen = bp->bif_hdrlen;
 
 	return (0);
 }
 #endif
 
 /*
  * Detach bpf from an interface. This involves detaching each descriptor
  * associated with the interface. Notify each descriptor as it's detached
  * so that any sleepers wake up and get ENXIO.
  */
 void
 bpfdetach(struct ifnet *ifp)
 {
 	struct bpf_if *bp, *bp_temp;
 	struct bpf_d *d;
 
 	BPF_LOCK();
 	/* Find all bpf_if struct's which reference ifp and detach them. */
 	CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
 		if (ifp != bp->bif_ifp)
 			continue;
 
 		CK_LIST_REMOVE(bp, bif_next);
 		*bp->bif_bpf = (struct bpf_if *)&dead_bpf_if;
 
 		CTR4(KTR_NET,
 		    "%s: sheduling free for encap %d (%p) for if %p",
 		    __func__, bp->bif_dlt, bp, ifp);
 
 		/* Detach common descriptors */
 		while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
 			bpf_detachd_locked(d, true);
 		}
 
 		/* Detach writer-only descriptors */
 		while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
 			bpf_detachd_locked(d, true);
 		}
 		bpfif_rele(bp);
 	}
 	BPF_UNLOCK();
 }
 
 /*
  * Get a list of available data link types of the interface d is bound to.
  *
  * When bfl->bfl_list is NULL only the number of types is reported in
  * bfl->bfl_len.  Otherwise the types are copied out to bfl->bfl_list and
  * bfl->bfl_len is set to the number copied; ENOMEM is returned when the
  * caller's bfl_len is too small to hold them all.  The global bpf lock
  * must be held.
  */
 static int
 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
 {
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 	u_int *dlts;
 	int count, error, filled;
 
 	BPF_LOCK_ASSERT();
 
 	ifp = d->bd_bif->bif_ifp;
 
 	/* First pass: count the bpf_if entries that belong to ifp. */
 	count = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp == ifp)
 			count++;
 	}
 
 	/* The caller only wants to know how many there are. */
 	if (bfl->bfl_list == NULL) {
 		bfl->bfl_len = count;
 		return (0);
 	}
 	if (count > bfl->bfl_len)
 		return (ENOMEM);
 
 	/* Second pass: gather the link types and hand them to the caller. */
 	dlts = malloc(count * sizeof(*dlts), M_TEMP, M_WAITOK);
 	filled = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp == ifp)
 			dlts[filled++] = bp->bif_dlt;
 	}
 	error = copyout(dlts, bfl->bfl_list, sizeof(*dlts) * filled);
 	free(dlts, M_TEMP);
 	bfl->bfl_len = filled;
 
 	return (error);
 }
 
 /*
  * Set the data link type of a BPF instance.
  *
  * The descriptor is re-attached to the bpf_if of the same interface that
  * has link type dlt; EINVAL is returned when there is none.  If the
  * descriptor was in promiscuous mode beforehand, promiscuous mode is
  * requested again after the re-attach; a failure to do so is only logged.
  * The global bpf lock must be held and d must be bound to an interface.
  */
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 	int error, was_promisc;
 
 	BPF_LOCK_ASSERT();
 	MPASS(d->bd_bif != NULL);
 
 	/*
 	 * bd_bif may be examined without BPFD_LOCK: it cannot change while
 	 * the global lock is held.
 	 */
 	if (d->bd_bif->bif_dlt == dlt)
 		return (0);
 
 	/* Look for a bpf_if of the same interface with the wanted type. */
 	ifp = d->bd_bif->bif_ifp;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_dlt == dlt && bp->bif_ifp == ifp)
 			break;
 	}
 	if (bp == NULL)
 		return (EINVAL);
 
 	was_promisc = d->bd_promisc;
 	bpf_attachd(d, bp);
 	if (!was_promisc)
 		return (0);
 
 	error = ifpromisc(bp->bif_ifp, 1);
 	if (error == 0)
 		d->bd_promisc = 1;
 	else
 		if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n",
 		    __func__, error);
 
 	return (0);
 }
 
 /*
  * Driver initialization: set up the global bpf lock and the list of
  * attached interfaces, then create the "bpf" device node together with
  * its "bpf0" alias.  The argument is unused.
  */
 static void
 bpf_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	sx_init(&bpf_sx, "bpf global lock");
 	CK_LIST_INIT(&bpf_iflist);
 
 	/* Owned by root:wheel, mode 0600. */
 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
 	/* For compatibility */
 	make_dev_alias(dev, "bpf0");
 }
 
 /*
  * Zero out the various packet counters associated with all of the bpf
  * descriptors.  At some point, we will probably want to get a bit more
  * granular and allow the user to specify descriptors to be zeroed.
  */
 static void
 bpf_zero_counters(void)
 {
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	BPF_LOCK();
 	/*
 	 * We are protected by global lock here, interfaces and
 	 * descriptors can not be deleted while we hold it.
 	 */
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			counter_u64_zero(bd->bd_rcount);
 			counter_u64_zero(bd->bd_dcount);
 			counter_u64_zero(bd->bd_fcount);
 			counter_u64_zero(bd->bd_wcount);
 			counter_u64_zero(bd->bd_wfcount);
 			counter_u64_zero(bd->bd_zcopy);
 		}
 	}
 	BPF_UNLOCK();
 }
 
 /*
  * Fill the exported statistics record d from the descriptor bd.  The
  * record is cleared first, so fields not set here read as zero.  The
  * global bpf lock must be held.
  */
 static void
 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 {
 
 	BPF_LOCK_ASSERT();
 
 	bzero(d, sizeof(*d));
 	d->bd_structsize = sizeof(*d);
 
 	/* Descriptor settings and state. */
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
 	d->bd_direction = bd->bd_direction;
 	d->bd_feedback = bd->bd_feedback;
 	d->bd_async = bd->bd_async;
 	d->bd_locked = bd->bd_locked;
 	d->bd_sig = bd->bd_sig;
 	d->bd_pid = bd->bd_pid;
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 
 	/* Buffer parameters. */
 	d->bd_bufmode = bd->bd_bufmode;
 	d->bd_bufsize = bd->bd_bufsize;
 	d->bd_slen = bd->bd_slen;
 	d->bd_hlen = bd->bd_hlen;
 
 	/* Read side counters. */
 	d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
 	d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
 	d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
 
 	/* Write side and zero-copy counters. */
 	d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
 	d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
 	d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
 	d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
 }
 
 /*
  * Handle `netstat -B' stats request
  */
 static int
 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	static const struct xbpf_d zerostats;
 	struct xbpf_d *xbdbuf, *xbd, tempstats;
 	int index, error;
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	/*
 	 * XXX This is not technically correct. It is possible for non
 	 * privileged users to open bpf devices. It would make sense
 	 * if the users who opened the devices were able to retrieve
 	 * the statistics for them, too.
 	 */
 	error = priv_check(req->td, PRIV_NET_BPF);
 	if (error)
 		return (error);
 	/*
 	 * Check to see if the user is requesting that the counters be
 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
 	 * as we aren't allowing the user to set the counters currently.
 	 */
 	if (req->newptr != NULL) {
 		if (req->newlen != sizeof(tempstats))
 			return (EINVAL);
 		memset(&tempstats, 0, sizeof(tempstats));
 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
 		if (error)
 			return (error);
 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
 			return (EINVAL);
 		bpf_zero_counters();
 		return (0);
 	}
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
 	if (bpf_bpfd_cnt == 0)
 		return (SYSCTL_OUT(req, 0, 0));
 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
 	BPF_LOCK();
 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
 		BPF_UNLOCK();
 		free(xbdbuf, M_BPF);
 		return (ENOMEM);
 	}
 	index = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		/* Send writers-only first */
 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			bpfstats_fill_xbpf(xbd, bd);
 		}
 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			bpfstats_fill_xbpf(xbd, bd);
 		}
 	}
 	BPF_UNLOCK();
 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
 	free(xbdbuf, M_BPF);
 	return (error);
 }
 
 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL);
 
 #else /* !DEV_BPF && !NETGRAPH_BPF */
 
 /*
  * NOP stubs to allow bpf-using drivers to load and function.
  *
  * A 'better' implementation would allow the core bpf functionality
  * to be loaded at runtime.
  */
 
 /* Stub used when bpf is not compiled in: the packet is ignored. */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 }
 
 /* Stub used when bpf is not compiled in: the mbuf chain is ignored. */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 }
 
 /*
  * Stub used when bpf is not compiled in: both the header and the mbuf
  * chain are ignored.
  */
 void
 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
 {
 }
 
 /*
  * Stub used when bpf is not compiled in: forwards to the stub
  * bpfattach2() with the interface's own if_bpf field as the driver
  * pointer.
  */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /*
  * Stub used when bpf is not compiled in: no bpf_if is allocated; the
  * driver's pointer is set to the shared dead_bpf_if placeholder instead.
  */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 
 	*driverp = (struct bpf_if *)&dead_bpf_if;
 }
 
 /* Stub used when bpf is not compiled in: there is nothing to detach. */
 void
 bpfdetach(struct ifnet *ifp)
 {
 }
 
 /*
  * Stub used when bpf is not compiled in.  Always returns the largest
  * u_int value, which is the "no filter" behaviour.
  */
 u_int
 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
 
 	return ((u_int)-1);
 }
 
 /*
  * Stub used when bpf is not compiled in.  Every filter program is
  * reported as invalid (false).
  */
 int
 bpf_validate(const struct bpf_insn *f, int len)
 {
 
 	return (0);
 }
 
 #endif /* !DEV_BPF && !NETGRAPH_BPF */
 
 #ifdef DDB
 /*
  * Debugger helper: print the address of bpf_if followed by a selection of
  * its fields, one per line.  A NULL pointer prints nothing.
  */
 static void
 bpf_show_bpf_if(struct bpf_if *bpf_if)
 {
 
 	if (bpf_if == NULL)
 		return;
 	db_printf("%p:\n", bpf_if);
 /*
  * BPF_DB_PRINTF prints member e of bpf_if with format f, labelled with the
  * member's name; BPF_DB_PRINTF_RAW does the same for an arbitrary
  * expression e.  Both expansions already end in a semicolon.
  */
 #define	BPF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, bpf_if->e);
 #define	BPF_DB_PRINTF_RAW(f, e)	db_printf("   %s = " f "\n", #e, e);
 	/* bif_ext.bif_next */
 	/* bif_ext.bif_dlist */
 	BPF_DB_PRINTF("%#x", bif_dlt);
 	BPF_DB_PRINTF("%u", bif_hdrlen);
 	/* bif_wlist */
 	BPF_DB_PRINTF("%p", bif_ifp);
 	BPF_DB_PRINTF("%p", bif_bpf);
 	BPF_DB_PRINTF_RAW("%u", refcount_load(&bpf_if->bif_refcnt));
 }
 
 /*
  * Debugger command "show bpf_if <addr>": treat the given address as a
  * struct bpf_if pointer and print it.  Without an address only the usage
  * message is printed.
  */
 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show bpf_if <struct bpf_if *>\n");
 		return;
 	}
 
 	/* The address is not validated before being dereferenced. */
 	bpf_show_bpf_if((struct bpf_if *)addr);
 }
 #endif
diff --git a/sys/net/bridgestp.c b/sys/net/bridgestp.c
index cf182d2efe7b..23c035b282f9 100644
--- a/sys/net/bridgestp.c
+++ b/sys/net/bridgestp.c
@@ -1,2306 +1,2307 @@
 /*	$NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2000 Jason L. Wright (jason@thought.net)
  * Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp
  */
 
 /*
  * Implementation of the spanning tree protocol as defined in
  * ISO/IEC 802.1D-2004, June 9, 2004.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/callout.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_llc.h>
 #include <net/if_media.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <net/bridgestp.h>
 
 /*
  * Debug tracing.  With BRIDGESTP_DEBUG defined, DPRINTF() prints its
  * arguments via printf() with a "bstp: " prefix; otherwise it expands to a
  * no-op expression and its arguments are not evaluated.
  */
 #ifdef	BRIDGESTP_DEBUG
 #define	DPRINTF(fmt, arg...)	printf("bstp: " fmt, ##arg)
 #else
 #define	DPRINTF(fmt, arg...)	(void)0
 #endif
 
 /*
  * Split the identifier pv into bytes and store bits 47..0 into
  * eaddr[0]..eaddr[5], most significant byte first (each element keeps only
  * the low bits that fit its type).
  *
  * Both arguments are parenthesized so that an expression argument such as
  * "a | b" is shifted as a whole.  Note that pv and eaddr are each evaluated
  * six times, so they must be free of side effects.
  */
 #define	PV2ADDR(pv, eaddr)	do {		\
 	(eaddr)[0] = (pv) >> 40;		\
 	(eaddr)[1] = (pv) >> 32;		\
 	(eaddr)[2] = (pv) >> 24;		\
 	(eaddr)[3] = (pv) >> 16;		\
 	(eaddr)[4] = (pv) >> 8;			\
 	(eaddr)[5] = (pv) >> 0;			\
 } while (0)
 
 /* Result codes returned by bstp_info_cmp(). */
 #define	INFO_BETTER	1
 #define	INFO_SAME	0
 #define	INFO_WORSE	-1
 
 /* Destination Ethernet address copied into every BPDU sent from this file. */
 const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 
 /*
  * File-scope list of bstp_state instances and a mutex declared next to it.
  * NOTE(review): presumably bstp_list_mtx protects bstp_list -- confirm
  * against the code that uses them.
  */
 LIST_HEAD(, bstp_state) bstp_list;
 static struct mtx	bstp_list_mtx;
 
 static void	bstp_transmit(struct bstp_state *, struct bstp_port *);
 static void	bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *);
 static void	bstp_transmit_tcn(struct bstp_state *, struct bstp_port *);
 static void	bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *,
 		    struct bstp_config_unit *);
 static void	bstp_send_bpdu(struct bstp_state *, struct bstp_port *,
 		    struct bstp_cbpdu *);
 static int	bstp_pdu_flags(struct bstp_port *);
 static void	bstp_received_stp(struct bstp_state *, struct bstp_port *,
 		    struct mbuf **, struct bstp_tbpdu *);
 static void	bstp_received_rstp(struct bstp_state *, struct bstp_port *,
 		    struct mbuf **, struct bstp_tbpdu *);
 static void	bstp_received_tcn(struct bstp_state *, struct bstp_port *,
 		    struct bstp_tcn_unit *);
 static void	bstp_received_bpdu(struct bstp_state *, struct bstp_port *,
 		    struct bstp_config_unit *);
 static int	bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *);
 static int	bstp_pdu_bettersame(struct bstp_port *, int);
 static int	bstp_info_cmp(struct bstp_pri_vector *,
 		    struct bstp_pri_vector *);
 static int	bstp_info_superior(struct bstp_pri_vector *,
 		    struct bstp_pri_vector *);
 static void	bstp_assign_roles(struct bstp_state *);
 static void	bstp_update_roles(struct bstp_state *, struct bstp_port *);
 static void	bstp_update_state(struct bstp_state *, struct bstp_port *);
 static void	bstp_update_tc(struct bstp_port *);
 static void	bstp_update_info(struct bstp_port *);
 static void	bstp_set_other_tcprop(struct bstp_port *);
 static void	bstp_set_all_reroot(struct bstp_state *);
 static void	bstp_set_all_sync(struct bstp_state *);
 static void	bstp_set_port_state(struct bstp_port *, int);
 static void	bstp_set_port_role(struct bstp_port *, int);
 static void	bstp_set_port_proto(struct bstp_port *, int);
 static void	bstp_set_port_tc(struct bstp_port *, int);
 static void	bstp_set_timer_tc(struct bstp_port *);
 static void	bstp_set_timer_msgage(struct bstp_port *);
 static int	bstp_rerooted(struct bstp_state *, struct bstp_port *);
 static uint32_t	bstp_calc_path_cost(struct bstp_port *);
 static void	bstp_notify_state(void *, int);
 static void	bstp_notify_rtage(void *, int);
 static void	bstp_ifupdstatus(void *, int);
 static void	bstp_enable_port(struct bstp_state *, struct bstp_port *);
 static void	bstp_disable_port(struct bstp_state *, struct bstp_port *);
 static void	bstp_tick(void *);
 static void	bstp_timer_start(struct bstp_timer *, uint16_t);
 static void	bstp_timer_stop(struct bstp_timer *);
 static void	bstp_timer_latch(struct bstp_timer *);
 static int	bstp_timer_dectest(struct bstp_timer *);
 static void	bstp_hello_timer_expiry(struct bstp_state *,
 		    struct bstp_port *);
 static void	bstp_message_age_expiry(struct bstp_state *,
 		    struct bstp_port *);
 static void	bstp_migrate_delay_expiry(struct bstp_state *,
 		    struct bstp_port *);
 static void	bstp_edge_delay_expiry(struct bstp_state *,
 		    struct bstp_port *);
 static int	bstp_addr_cmp(const uint8_t *, const uint8_t *);
 static int	bstp_same_bridgeid(uint64_t, uint64_t);
 static void	bstp_reinit(struct bstp_state *);
 
 /*
  * Transmit a PDU on the port if permitted.
  *
  * Nothing is sent when the bridge is not running, when the hello timer is
  * inactive (the hello expiry handler is invoked instead) or when the port's
  * transmit count exceeds the bridge hold count.  RSTP ports and STP
  * designated ports send a configuration BPDU and clear bp_tc_ack; STP root
  * ports send a TCN; other STP roles send nothing.  In every case that gets
  * past the checks the hello timer is restarted and BSTP_PORT_NEWINFO is
  * cleared.
  */
 static void
 bstp_transmit(struct bstp_state *bs, struct bstp_port *bp)
 {
 	NET_EPOCH_ASSERT();
 
 	if (!bs->bs_running)
 		return;
 
 	/*
 	 * a PDU can only be sent if we have tx quota left and the
 	 * hello timer is running.
 	 */
 	if (!bp->bp_hello_timer.active) {
 		/* Test if it needs to be reset */
 		bstp_hello_timer_expiry(bs, bp);
 		return;
 	}
 	if (bp->bp_txcount > bs->bs_txholdcount) {
 		/* Ran out of karma */
 		return;
 	}
 
 	if (bp->bp_protover == BSTP_PROTO_RSTP ||
 	    bp->bp_role == BSTP_ROLE_DESIGNATED) {
 		/* RSTP (any role) or an STP designated port */
 		bstp_transmit_bpdu(bs, bp);
 		bp->bp_tc_ack = 0;
 	} else if (bp->bp_role == BSTP_ROLE_ROOT) {
 		/* STP root port */
 		bstp_transmit_tcn(bs, bp);
 	}
 
 	bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime);
 	bp->bp_flags &= ~BSTP_PORT_NEWINFO;
 }
 
 static void
 bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp)
 {
 	struct bstp_cbpdu bpdu;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48);
 	PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr);
 
 	bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost);
 
 	bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48);
 	PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr);
 
 	bpdu.cbu_portid = htons(bp->bp_port_id);
 	bpdu.cbu_messageage = htons(bp->bp_desg_msg_age);
 	bpdu.cbu_maxage = htons(bp->bp_desg_max_age);
 	bpdu.cbu_hellotime = htons(bp->bp_desg_htime);
 	bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay);
 
 	bpdu.cbu_flags = bstp_pdu_flags(bp);
 
 	switch (bp->bp_protover) {
 		case BSTP_PROTO_STP:
 			bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG;
 			break;
 
 		case BSTP_PROTO_RSTP:
 			bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP;
 			break;
 	}
 
 	bstp_send_bpdu(bs, bp, &bpdu);
 }
 
 static void
 bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp)
 {
 	struct bstp_tbpdu bpdu;
 	struct ifnet *ifp = bp->bp_ifp;
 	struct ether_header *eh;
 	struct mbuf *m;
 
 	KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__));
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 
 	m->m_pkthdr.rcvif = ifp;
 	m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu);
 	m->m_len = m->m_pkthdr.len;
 
 	eh = mtod(m, struct ether_header *);
 
 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
 	eh->ether_type = htons(sizeof(bpdu));
 
 	bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP;
 	bpdu.tbu_ctl = LLC_UI;
 	bpdu.tbu_protoid = 0;
 	bpdu.tbu_protover = 0;
 	bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN;
 
 	memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu));
 
 	bp->bp_txcount++;
 	ifp->if_transmit(ifp, m);
 }
 
 static void
 bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu,
     struct bstp_config_unit *cu)
 {
 	int flags;
 
 	cu->cu_pv.pv_root_id =
 	    (((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) |
 	    (((uint64_t)cpdu->cbu_rootaddr[0]) << 40) |
 	    (((uint64_t)cpdu->cbu_rootaddr[1]) << 32) |
 	    (((uint64_t)cpdu->cbu_rootaddr[2]) << 24) |
 	    (((uint64_t)cpdu->cbu_rootaddr[3]) << 16) |
 	    (((uint64_t)cpdu->cbu_rootaddr[4]) << 8) |
 	    (((uint64_t)cpdu->cbu_rootaddr[5]) << 0);
 
 	cu->cu_pv.pv_dbridge_id =
 	    (((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) |
 	    (((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0);
 
 	cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost);
 	cu->cu_message_age = ntohs(cpdu->cbu_messageage);
 	cu->cu_max_age = ntohs(cpdu->cbu_maxage);
 	cu->cu_hello_time = ntohs(cpdu->cbu_hellotime);
 	cu->cu_forward_delay = ntohs(cpdu->cbu_forwarddelay);
 	cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid);
 	cu->cu_pv.pv_port_id = bp->bp_port_id;
 	cu->cu_message_type = cpdu->cbu_bpdutype;
 
 	/* Strip off unused flags in STP mode */
 	flags = cpdu->cbu_flags;
 	switch (cpdu->cbu_protover) {
 		case BSTP_PROTO_STP:
 			flags &= BSTP_PDU_STPMASK;
 			/* A STP BPDU explicitly conveys a Designated Port */
 			cu->cu_role = BSTP_ROLE_DESIGNATED;
 			break;
 
 		case BSTP_PROTO_RSTP:
 			flags &= BSTP_PDU_RSTPMASK;
 			break;
 	}
 
 	cu->cu_topology_change_ack =
 		(flags & BSTP_PDU_F_TCA) ? 1 : 0;
 	cu->cu_proposal =
 		(flags & BSTP_PDU_F_P) ? 1 : 0;
 	cu->cu_agree =
 		(flags & BSTP_PDU_F_A) ? 1 : 0;
 	cu->cu_learning =
 		(flags & BSTP_PDU_F_L) ? 1 : 0;
 	cu->cu_forwarding =
 		(flags & BSTP_PDU_F_F) ? 1 : 0;
 	cu->cu_topology_change =
 		(flags & BSTP_PDU_F_TC) ? 1 : 0;
 
 	switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) {
 		case BSTP_PDU_F_ROOT:
 			cu->cu_role = BSTP_ROLE_ROOT;
 			break;
 		case BSTP_PDU_F_ALT:
 			cu->cu_role = BSTP_ROLE_ALTERNATE;
 			break;
 		case BSTP_PDU_F_DESG:
 			cu->cu_role = BSTP_ROLE_DESIGNATED;
 			break;
 	}
 }
 
 static void
 bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp,
     struct bstp_cbpdu *bpdu)
 {
 	struct ifnet *ifp;
 	struct mbuf *m;
 	struct ether_header *eh;
 
 	BSTP_LOCK_ASSERT(bs);
 	NET_EPOCH_ASSERT();
 
 	ifp = bp->bp_ifp;
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 
 	eh = mtod(m, struct ether_header *);
 
 	bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP;
 	bpdu->cbu_ctl = LLC_UI;
 	bpdu->cbu_protoid = htons(BSTP_PROTO_ID);
 
 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
 
 	switch (bpdu->cbu_bpdutype) {
 		case BSTP_MSGTYPE_CFG:
 			bpdu->cbu_protover = BSTP_PROTO_STP;
 			m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN;
 			eh->ether_type = htons(BSTP_BPDU_STP_LEN);
 			memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu,
 			    BSTP_BPDU_STP_LEN);
 			break;
 
 		case BSTP_MSGTYPE_RSTP:
 			bpdu->cbu_protover = BSTP_PROTO_RSTP;
 			bpdu->cbu_versionlen = htons(0);
 			m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN;
 			eh->ether_type = htons(BSTP_BPDU_RSTP_LEN);
 			memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu,
 			    BSTP_BPDU_RSTP_LEN);
 			break;
 
 		default:
 			panic("not implemented");
 	}
 	m->m_pkthdr.rcvif = ifp;
 	m->m_len = m->m_pkthdr.len;
 
 	bp->bp_txcount++;
 	ifp->if_transmit(ifp, m);
 }
 
 /*
  * Compute the flag byte for an outgoing BPDU from the port's current
  * state: proposal, agreement, topology change / acknowledgement, the
  * learning and forwarding bits, and the encoded port role.  The result is
  * finally masked down to the flags valid for the port's protocol version.
  */
 static int
 bstp_pdu_flags(struct bstp_port *bp)
 {
 	int flags = 0;
 
 	if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING)
 		flags |= BSTP_PDU_F_P;
 	if (bp->bp_agree)
 		flags |= BSTP_PDU_F_A;
 	if (bp->bp_tc_timer.active)
 		flags |= BSTP_PDU_F_TC;
 	if (bp->bp_tc_ack)
 		flags |= BSTP_PDU_F_TCA;
 
 	/* a forwarding port is also reported as learning */
 	if (bp->bp_state == BSTP_IFSTATE_LEARNING)
 		flags |= BSTP_PDU_F_L;
 	else if (bp->bp_state == BSTP_IFSTATE_FORWARDING)
 		flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F);
 
 	/* backup ports are advertised with the alternate role encoding */
 	if (bp->bp_role == BSTP_ROLE_ROOT)
 		flags |= (BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT);
 	else if (bp->bp_role == BSTP_ROLE_ALTERNATE ||
 	    bp->bp_role == BSTP_ROLE_BACKUP)
 		flags |= (BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT);
 	else if (bp->bp_role == BSTP_ROLE_DESIGNATED)
 		flags |= (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT);
 
 	/* Strip off unused flags in either mode */
 	if (bp->bp_protover == BSTP_PROTO_STP)
 		flags &= BSTP_PDU_STPMASK;
 	else if (bp->bp_protover == BSTP_PROTO_RSTP)
 		flags &= BSTP_PDU_RSTPMASK;
 
 	return (flags);
 }
 
 /*
  * Process a frame received on a spanning tree port.
  *
  * The frame is dropped unless the port is active, the length carried in
  * the Ethernet type field covers at least a struct bstp_tbpdu, the LLC
  * header matches and the protocol id is BSTP_PROTO_ID.  A PDU whose version
  * differs from the port's is only accepted (and the port switched to that
  * version) when BSTP_PORT_CANMIGRATE is set.  Accepted PDUs clear
  * bp_operedge, restart the edge delay timer and are dispatched to
  * bstp_received_stp() or bstp_received_rstp().
  *
  * The mbuf is consumed on every path: it is freed here unless m has become
  * NULL (after a failed m_pullup(), here or in a callee via &m).  The bridge
  * lock is taken for everything past the bp_active check.
  */
 void
 bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m)
 {
 	struct bstp_state *bs = bp->bp_bs;
 	struct ether_header *eh;
 	struct bstp_tbpdu tpdu;
 	uint16_t len;
 
 	if (bp->bp_active == 0) {
 		m_freem(m);
 		return;
 	}
 
 	BSTP_LOCK(bs);
 
 	eh = mtod(m, struct ether_header *);
 
 	/* the type field is treated as the 802.3 payload length */
 	len = ntohs(eh->ether_type);
 	if (len < sizeof(tpdu))
 		goto out;
 
 	m_adj(m, ETHER_HDR_LEN);
 
 	/* negative count: trim trailing bytes beyond the advertised length */
 	if (m->m_pkthdr.len > len)
 		m_adj(m, len - m->m_pkthdr.len);
 	if (m->m_len < sizeof(tpdu) &&
 	    (m = m_pullup(m, sizeof(tpdu))) == NULL)
 		goto out;
 
 	memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu));
 
 	/* basic packet checks */
 	if (tpdu.tbu_dsap != LLC_8021D_LSAP ||
 	    tpdu.tbu_ssap != LLC_8021D_LSAP ||
 	    tpdu.tbu_ctl != LLC_UI)
 		goto out;
 	if (tpdu.tbu_protoid != BSTP_PROTO_ID)
 		goto out;
 
 	/*
 	 * We can treat later versions of the PDU as the same as the maximum
 	 * version we implement. All additional parameters/flags are ignored.
 	 */
 	/* only the local copy is clamped; the mbuf contents are unchanged */
 	if (tpdu.tbu_protover > BSTP_PROTO_MAX)
 		tpdu.tbu_protover = BSTP_PROTO_MAX;
 
 	if (tpdu.tbu_protover != bp->bp_protover) {
 		/*
 		 * Wait for the migration delay timer to expire before changing
 		 * protocol version to avoid flip-flops.
 		 */
 		if (bp->bp_flags & BSTP_PORT_CANMIGRATE)
 			bstp_set_port_proto(bp, tpdu.tbu_protover);
 		else
 			goto out;
 	}
 
 	/* Clear operedge upon receiving a PDU on the port */
 	bp->bp_operedge = 0;
 	bstp_timer_start(&bp->bp_edge_delay_timer,
 	    BSTP_DEFAULT_MIGRATE_DELAY);
 
 	/* the handlers may replace m (m_pullup), hence the pointer to it */
 	switch (tpdu.tbu_protover) {
 		case BSTP_PROTO_STP:
 			bstp_received_stp(bs, bp, &m, &tpdu);
 			break;
 
 		case BSTP_PROTO_RSTP:
 			bstp_received_rstp(bs, bp, &m, &tpdu);
 			break;
 	}
 out:
 	BSTP_UNLOCK(bs);
 	if (m)
 		m_freem(m);
 }
 
 static void
 bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp,
     struct mbuf **mp, struct bstp_tbpdu *tpdu)
 {
 	struct bstp_cbpdu cpdu;
 	struct bstp_config_unit *cu = &bp->bp_msg_cu;
 	struct bstp_tcn_unit tu;
 
 	switch (tpdu->tbu_bpdutype) {
 	case BSTP_MSGTYPE_TCN:
 		tu.tu_message_type = tpdu->tbu_bpdutype;
 		bstp_received_tcn(bs, bp, &tu);
 		break;
 	case BSTP_MSGTYPE_CFG:
 		if ((*mp)->m_len < BSTP_BPDU_STP_LEN &&
 		    (*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL)
 			return;
 		memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN);
 
 		bstp_decode_bpdu(bp, &cpdu, cu);
 		bstp_received_bpdu(bs, bp, cu);
 		break;
 	}
 }
 
 static void
 bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp,
     struct mbuf **mp, struct bstp_tbpdu *tpdu)
 {
 	struct bstp_cbpdu cpdu;
 	struct bstp_config_unit *cu = &bp->bp_msg_cu;
 
 	if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP)
 		return;
 
 	if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN &&
 	    (*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL)
 		return;
 	memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN);
 
 	bstp_decode_bpdu(bp, &cpdu, cu);
 	bstp_received_bpdu(bs, bp, cu);
 }
 
 /*
  * Handle a received topology change notification: mark the port as having
  * received a TCN and run the topology change state machine.  The bs and tcn
  * arguments are not used.
  */
 static void
 bstp_received_tcn(struct bstp_state *bs, struct bstp_port *bp,
     struct bstp_tcn_unit *tcn)
 {
 	bp->bp_rcvdtcn = 1;
 	bstp_update_tc(bp);
 }
 
 /*
  * Apply a decoded configuration BPDU to the port.
  *
  * The PDU is ignored while the port's info is disabled or aged, or when
  * any of its timer values fails the range checks.  Otherwise it is
  * classified by bstp_pdu_rcvtype() and handled per class:
  *  - SUPERIOR:    adopt the received vector and timers, re-arm the message
  *                 age timer, mark the info as received and reassign roles.
  *  - REPEATED:    latch proposal/TC flags and re-arm the message age timer.
  *  - INFERIOR:    a learning sender counts as agreement.
  *  - INFERIORALT: agreement is only honoured on point-to-point links.
  *  - OTHER:       nothing is done, not even the state update.
  * All classes except OTHER finish by running bstp_update_state().
  * The bridge lock must be held.
  */
 static void
 bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp,
     struct bstp_config_unit *cu)
 {
 	int type;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	/* We need to have transitioned to INFO_MINE before proceeding */
 	switch (bp->bp_infois) {
 		case BSTP_INFO_DISABLED:
 		case BSTP_INFO_AGED:
 			return;
 	}
 
 	/* range checks */
 	if (cu->cu_message_age >= cu->cu_max_age) {
 		return;
 	}
 	if (cu->cu_max_age < BSTP_MIN_MAX_AGE ||
 	    cu->cu_max_age > BSTP_MAX_MAX_AGE) {
 		return;
 	}
 	if (cu->cu_forward_delay < BSTP_MIN_FORWARD_DELAY ||
 	    cu->cu_forward_delay > BSTP_MAX_FORWARD_DELAY) {
 		return;
 	}
 	if (cu->cu_hello_time < BSTP_MIN_HELLO_TIME ||
 	    cu->cu_hello_time > BSTP_MAX_HELLO_TIME) {
 		return;
 	}
 
 	type = bstp_pdu_rcvtype(bp, cu);
 
 	switch (type) {
 		case BSTP_PDU_SUPERIOR:
 			bs->bs_allsynced = 0;
 			bp->bp_agreed = 0;
 			bp->bp_proposing = 0;
 
 			if (cu->cu_proposal && cu->cu_forwarding == 0)
 				bp->bp_proposed = 1;
 			if (cu->cu_topology_change)
 				bp->bp_rcvdtc = 1;
 			if (cu->cu_topology_change_ack)
 				bp->bp_rcvdtca = 1;
 
 			/*
 			 * must run before bp_port_pv is overwritten below:
 			 * bstp_pdu_bettersame() compares against it
 			 */
 			if (bp->bp_agree &&
 			    !bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED))
 				bp->bp_agree = 0;
 
 			/* copy the received priority and timers to the port */
 			bp->bp_port_pv = cu->cu_pv;
 			bp->bp_port_msg_age = cu->cu_message_age;
 			bp->bp_port_max_age = cu->cu_max_age;
 			bp->bp_port_fdelay = cu->cu_forward_delay;
 			bp->bp_port_htime =
 				(cu->cu_hello_time > BSTP_MIN_HELLO_TIME ?
 				 cu->cu_hello_time : BSTP_MIN_HELLO_TIME);
 
 			/* set expiry for the new info */
 			bstp_set_timer_msgage(bp);
 
 			bp->bp_infois = BSTP_INFO_RECEIVED;
 			bstp_assign_roles(bs);
 			break;
 
 		case BSTP_PDU_REPEATED:
 			if (cu->cu_proposal && cu->cu_forwarding == 0)
 				bp->bp_proposed = 1;
 			if (cu->cu_topology_change)
 				bp->bp_rcvdtc = 1;
 			if (cu->cu_topology_change_ack)
 				bp->bp_rcvdtca = 1;
 
 			/* rearm the age timer */
 			bstp_set_timer_msgage(bp);
 			break;
 
 		case BSTP_PDU_INFERIOR:
 			if (cu->cu_learning) {
 				bp->bp_agreed = 1;
 				bp->bp_proposing = 0;
 			}
 			break;
 
 		case BSTP_PDU_INFERIORALT:
 			/*
 			 * only point to point links are allowed fast
 			 * transitions to forwarding.
 			 */
 			if (cu->cu_agree && bp->bp_ptp_link) {
 				bp->bp_agreed = 1;
 				bp->bp_proposing = 0;
 			} else
 				bp->bp_agreed = 0;
 
 			if (cu->cu_topology_change)
 				bp->bp_rcvdtc = 1;
 			if (cu->cu_topology_change_ack)
 				bp->bp_rcvdtca = 1;
 			break;
 
 		case BSTP_PDU_OTHER:
 			return;	/* do nothing */
 	}
 	/* update the state machines with the new data */
 	bstp_update_state(bs, bp);
 }
 
 /*
  * Classify a received configuration unit against the port's current
  * priority vector and timers.
  *
  * From a designated sender: SUPERIOR if bstp_info_superior() says so, or if
  * the vectors compare equal but any timer differs; REPEATED if vector and
  * timers are all equal; INFERIOR otherwise.  From a root, alternate or
  * backup sender: INFERIORALT if the message vector is the same or worse.
  * Everything else is BSTP_PDU_OTHER.
  */
 static int
 bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu)
 {
 	struct bstp_pri_vector *portpv = &bp->bp_port_pv;
 	struct bstp_pri_vector *msgpv = &cu->cu_pv;
 
 	switch (cu->cu_role) {
 	case BSTP_ROLE_DESIGNATED:
 		/* bpdu priority is superior */
 		if (bstp_info_superior(portpv, msgpv))
 			return (BSTP_PDU_SUPERIOR);
 
 		/* bpdu priority is worse */
 		if (bstp_info_cmp(portpv, msgpv) != INFO_SAME)
 			return (BSTP_PDU_INFERIOR);
 
 		/* bpdu is equal */
 		if (bp->bp_port_msg_age == cu->cu_message_age &&
 		    bp->bp_port_max_age == cu->cu_max_age &&
 		    bp->bp_port_fdelay == cu->cu_forward_delay &&
 		    bp->bp_port_htime == cu->cu_hello_time)
 			return (BSTP_PDU_REPEATED);
 
 		/* bpdu priority is equal and timers differ */
 		return (BSTP_PDU_SUPERIOR);
 
 	case BSTP_ROLE_ROOT:
 	case BSTP_ROLE_ALTERNATE:
 	case BSTP_ROLE_BACKUP:
 		/*
 		 * not a designated port and priority is the same or
 		 * worse
 		 */
 		if (bstp_info_cmp(portpv, msgpv) <= INFO_SAME)
 			return (BSTP_PDU_INFERIORALT);
 		break;
 	}
 
 	/* default return type */
 	return (BSTP_PDU_OTHER);
 }
 
 /*
  * Return 1 when the port's info is of the kind given by newinfo and the
  * matching candidate vector (the last received message for
  * BSTP_INFO_RECEIVED, the designated vector for BSTP_INFO_MINE) compares
  * the same as or better than the port priority vector; 0 otherwise.
  */
 static int
 bstp_pdu_bettersame(struct bstp_port *bp, int newinfo)
 {
 	struct bstp_pri_vector *candidate;
 
 	if (bp->bp_infois != newinfo)
 		return (0);
 
 	switch (newinfo) {
 	case BSTP_INFO_RECEIVED:
 		candidate = &bp->bp_msg_cu.cu_pv;
 		break;
 	case BSTP_INFO_MINE:
 		candidate = &bp->bp_desg_pv;
 		break;
 	default:
 		return (0);
 	}
 
 	if (bstp_info_cmp(&bp->bp_port_pv, candidate) >= INFO_SAME)
 		return (1);
 	return (0);
 }
 
 /*
  * Rank the candidate vector cpv against the reference vector pv.
  *
  * The fields are compared in order -- root id, path cost, designated
  * bridge id, designated port id -- and the first one that differs decides:
  * a numerically lower value in cpv is INFO_BETTER, a higher one INFO_WORSE.
  * INFO_SAME is returned when all four fields are equal.
  */
 static int
 bstp_info_cmp(struct bstp_pri_vector *pv,
     struct bstp_pri_vector *cpv)
 {
 	if (cpv->pv_root_id != pv->pv_root_id)
 		return (cpv->pv_root_id < pv->pv_root_id ?
 		    INFO_BETTER : INFO_WORSE);
 
 	if (cpv->pv_cost != pv->pv_cost)
 		return (cpv->pv_cost < pv->pv_cost ?
 		    INFO_BETTER : INFO_WORSE);
 
 	if (cpv->pv_dbridge_id != pv->pv_dbridge_id)
 		return (cpv->pv_dbridge_id < pv->pv_dbridge_id ?
 		    INFO_BETTER : INFO_WORSE);
 
 	if (cpv->pv_dport_id != pv->pv_dport_id)
 		return (cpv->pv_dport_id < pv->pv_dport_id ?
 		    INFO_BETTER : INFO_WORSE);
 
 	return (INFO_SAME);
 }
 
 /*
  * Decide whether the message priority vector cpv supersedes the port
  * priority vector pv.  It does if, and only if, it is better than pv, or
  * it was sent by the same designated bridge and the same designated port
  * (compared on the low 12 bits of the port id) as recorded in pv.
  * Returns 1 if superior, 0 otherwise.
  */
 static int
 bstp_info_superior(struct bstp_pri_vector *pv,
     struct bstp_pri_vector *cpv)
 {
 	/* strictly better information always wins */
 	if (bstp_info_cmp(pv, cpv) == INFO_BETTER)
 		return (1);
 
 	/* otherwise it must come from the same bridge ... */
 	if (!bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id))
 		return (0);
 
 	/* ... and from the same port on that bridge */
 	if ((cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff))
 		return (1);
 	return (0);
 }
 
 /*
  * Recompute the bridge's root information and every port's role.
  *
  * Pass 1 starts from the bridge's own priority vector and timers, then
  * looks at each port holding received info: its port vector plus the
  * port's path cost replaces the root vector when it comes from another
  * bridge and ranks better.  The winning port, if any, is remembered in rbp.
  *
  * Pass 2 derives each port's designated vector and designated times from
  * the result and sets the port role according to where its info came from
  * (bp_infois).  bs_root_port is set only when a port is given the root
  * role; otherwise it stays NULL.
  */
 static void
 bstp_assign_roles(struct bstp_state *bs)
 {
 	struct bstp_port *bp, *rbp = NULL;
 	struct bstp_pri_vector pv;
 
 	/* default to our priority vector */
 	bs->bs_root_pv = bs->bs_bridge_pv;
 	bs->bs_root_msg_age = 0;
 	bs->bs_root_max_age = bs->bs_bridge_max_age;
 	bs->bs_root_fdelay = bs->bs_bridge_fdelay;
 	bs->bs_root_htime = bs->bs_bridge_htime;
 	bs->bs_root_port = NULL;
 
 	/* check if any received info supersedes us */
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 		if (bp->bp_infois != BSTP_INFO_RECEIVED)
 			continue;
 
 		/* root path vector: received vector plus this port's cost */
 		pv = bp->bp_port_pv;
 		pv.pv_cost += bp->bp_path_cost;
 
 		/*
 		 * The root priority vector is the best of the set comprising
 		 * the bridge priority vector plus all root path priority
 		 * vectors whose bridge address is not equal to us.
 		 */
 		if (bstp_same_bridgeid(pv.pv_dbridge_id,
 		    bs->bs_bridge_pv.pv_dbridge_id) == 0 &&
 		    bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) {
 			/* the port vector replaces the root */
 			bs->bs_root_pv = pv;
 			bs->bs_root_msg_age = bp->bp_port_msg_age +
 			    BSTP_MESSAGE_AGE_INCR;
 			bs->bs_root_max_age = bp->bp_port_max_age;
 			bs->bs_root_fdelay = bp->bp_port_fdelay;
 			bs->bs_root_htime = bp->bp_port_htime;
 			rbp = bp;
 		}
 	}
 
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 		/* calculate the port designated vector */
 		bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id;
 		bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost;
 		bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id;
 		bp->bp_desg_pv.pv_dport_id = bp->bp_port_id;
 		bp->bp_desg_pv.pv_port_id = bp->bp_port_id;
 
 		/* calculate designated times */
 		/* note: hello time comes from the bridge, not the root */
 		bp->bp_desg_msg_age = bs->bs_root_msg_age;
 		bp->bp_desg_max_age = bs->bs_root_max_age;
 		bp->bp_desg_fdelay = bs->bs_root_fdelay;
 		bp->bp_desg_htime = bs->bs_bridge_htime;
 
 		switch (bp->bp_infois) {
 		case BSTP_INFO_DISABLED:
 			bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
 			break;
 
 		case BSTP_INFO_AGED:
 			bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
 			bstp_update_info(bp);
 			break;
 
 		case BSTP_INFO_MINE:
 			bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
 			/* update the port info if stale */
 			if (bstp_info_cmp(&bp->bp_port_pv,
 			    &bp->bp_desg_pv) != INFO_SAME ||
 			    (rbp != NULL &&
 			    (bp->bp_port_msg_age != rbp->bp_port_msg_age ||
 			    bp->bp_port_max_age != rbp->bp_port_max_age ||
 			    bp->bp_port_fdelay != rbp->bp_port_fdelay ||
 			    bp->bp_port_htime != rbp->bp_port_htime)))
 				bstp_update_info(bp);
 			break;
 
 		case BSTP_INFO_RECEIVED:
 			if (bp == rbp) {
 				/*
 				 * root priority is derived from this
 				 * port, make it the root port.
 				 */
 				bstp_set_port_role(bp, BSTP_ROLE_ROOT);
 				bs->bs_root_port = bp;
 			} else if (bstp_info_cmp(&bp->bp_port_pv,
 				    &bp->bp_desg_pv) == INFO_BETTER) {
 				/*
 				 * the port priority is lower than the root
 				 * port.
 				 */
 				bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
 				bstp_update_info(bp);
 			} else {
 				if (bstp_same_bridgeid(
 				    bp->bp_port_pv.pv_dbridge_id,
 				    bs->bs_bridge_pv.pv_dbridge_id)) {
 					/*
 					 * the designated bridge refers to
 					 * another port on this bridge.
 					 */
 					bstp_set_port_role(bp,
 					    BSTP_ROLE_BACKUP);
 				} else {
 					/*
 					 * the port is an inferior path to the
 					 * root bridge.
 					 */
 					bstp_set_port_role(bp,
 					    BSTP_ROLE_ALTERNATE);
 				}
 			}
 			break;
 		}
 	}
 }
 
 /*
  * Run the per-port state machines after new information has arrived.
  *
  * If the bridge is not currently marked as fully synchronised, recompute
  * bs_allsynced: it becomes 1 only when every port is either synced or
  * holds the root role.  Then the role and topology change machines are run
  * for bp.  The bridge lock must be held.
  */
 static void
 bstp_update_state(struct bstp_state *bs, struct bstp_port *bp)
 {
 	struct bstp_port *p;
 	int allsynced;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	/* check if all the ports have synchronised again */
 	if (bs->bs_allsynced == 0) {
 		allsynced = 1;
 		LIST_FOREACH(p, &bs->bs_bplist, bp_next) {
 			if (p->bp_synced || p->bp_role == BSTP_ROLE_ROOT)
 				continue;
 			/* one unsynced non-root port is enough */
 			allsynced = 0;
 			break;
 		}
 		bs->bs_allsynced = allsynced;
 	}
 
 	bstp_update_roles(bs, bp);
 	bstp_update_tc(bp);
 }
 
 /*
  * Run the role transition logic for one port.
  *
  * The checks performed depend on the port's current role (bp_role); each
  * "if" below corresponds to one transition, named in the accompanying
  * DPRINTF where there is one.  The checks within a role are evaluated in
  * sequence, so an earlier one can change flags that a later one tests.
  * After the role-specific part, a PDU is transmitted if BSTP_PORT_NEWINFO
  * is set on the port.  Must be called inside the network epoch.
  */
 static void
 bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp)
 {
 	NET_EPOCH_ASSERT();
 
 	switch (bp->bp_role) {
 	case BSTP_ROLE_DISABLED:
 		/* Clear any flags if set */
 		if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) {
 			bp->bp_sync = 0;
 			bp->bp_synced = 1;
 			bp->bp_reroot = 0;
 		}
 		break;
 
 	case BSTP_ROLE_ALTERNATE:
 	case BSTP_ROLE_BACKUP:
 		/* agree once everything is synced, or re-agree to a proposal */
 		if ((bs->bs_allsynced && !bp->bp_agree) ||
 		    (bp->bp_proposed && bp->bp_agree)) {
 			bp->bp_proposed = 0;
 			bp->bp_agree = 1;
 			bp->bp_flags |= BSTP_PORT_NEWINFO;
 			DPRINTF("%s -> ALTERNATE_AGREED\n",
 			    bp->bp_ifp->if_xname);
 		}
 
 		/* a proposal we cannot agree to yet: ask all ports to sync */
 		if (bp->bp_proposed && !bp->bp_agree) {
 			bstp_set_all_sync(bs);
 			bp->bp_proposed = 0;
 			DPRINTF("%s -> ALTERNATE_PROPOSED\n",
 			    bp->bp_ifp->if_xname);
 		}
 
 		/* Clear any flags if set */
 		if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) {
 			bp->bp_sync = 0;
 			bp->bp_synced = 1;
 			bp->bp_reroot = 0;
 			DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname);
 		}
 		break;
 
 	case BSTP_ROLE_ROOT:
 		/* a root port that is not forwarding yet sets reroot on all ports */
 		if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) {
 			bstp_set_all_reroot(bs);
 			DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname);
 		}
 
 		if ((bs->bs_allsynced && !bp->bp_agree) ||
 		    (bp->bp_proposed && bp->bp_agree)) {
 			bp->bp_proposed = 0;
 			bp->bp_sync = 0;
 			bp->bp_agree = 1;
 			bp->bp_flags |= BSTP_PORT_NEWINFO;
 			DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname);
 		}
 
 		if (bp->bp_proposed && !bp->bp_agree) {
 			bstp_set_all_sync(bs);
 			bp->bp_proposed = 0;
 			DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname);
 		}
 
 		/*
 		 * step one state towards forwarding when the forward delay
 		 * timer is not running, or (RSTP only) when bstp_rerooted()
 		 * holds and the recent backup timer is not running
 		 */
 		if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
 		    (bp->bp_forward_delay_timer.active == 0 ||
 		    (bstp_rerooted(bs, bp) &&
 		    bp->bp_recent_backup_timer.active == 0 &&
 		    bp->bp_protover == BSTP_PROTO_RSTP))) {
 			switch (bp->bp_state) {
 			case BSTP_IFSTATE_DISCARDING:
 				bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING);
 				break;
 			case BSTP_IFSTATE_LEARNING:
 				bstp_set_port_state(bp,
 				    BSTP_IFSTATE_FORWARDING);
 				break;
 			}
 		}
 
 		if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) {
 			bp->bp_reroot = 0;
 			DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname);
 		}
 		break;
 
 	case BSTP_ROLE_DESIGNATED:
 		if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) {
 			bp->bp_reroot = 0;
 			DPRINTF("%s -> DESIGNATED_RETIRED\n",
 			    bp->bp_ifp->if_xname);
 		}
 
 		if ((bp->bp_state == BSTP_IFSTATE_DISCARDING &&
 		    !bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) ||
 		    (bp->bp_operedge && !bp->bp_synced) ||
 		    (bp->bp_sync && bp->bp_synced)) {
 			bstp_timer_stop(&bp->bp_recent_root_timer);
 			bp->bp_synced = 1;
 			bp->bp_sync = 0;
 			DPRINTF("%s -> DESIGNATED_SYNCED\n",
 			    bp->bp_ifp->if_xname);
 		}
 
 		/* start proposing; the edge delay depends on the link type */
 		if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
 		    !bp->bp_agreed && !bp->bp_proposing &&
 		    !bp->bp_operedge) {
 			bp->bp_proposing = 1;
 			bp->bp_flags |= BSTP_PORT_NEWINFO;
 			bstp_timer_start(&bp->bp_edge_delay_timer,
 			    (bp->bp_ptp_link ? BSTP_DEFAULT_MIGRATE_DELAY :
 			     bp->bp_desg_max_age));
 			DPRINTF("%s -> DESIGNATED_PROPOSE\n",
 			    bp->bp_ifp->if_xname);
 		}
 
 		if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
 		    (bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed ||
 		    bp->bp_operedge) &&
 		    (bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) &&
 		    !bp->bp_sync) {
 			if (bp->bp_agreed)
 				DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname);
 			/*
 			 * If agreed|operedge then go straight to forwarding,
 			 * otherwise follow discard -> learn -> forward.
 			 */
 			if (bp->bp_agreed || bp->bp_operedge ||
 			    bp->bp_state == BSTP_IFSTATE_LEARNING) {
 				bstp_set_port_state(bp,
 				    BSTP_IFSTATE_FORWARDING);
 				/*
 				 * NOTE(review): stores the protocol version
 				 * number in bp_agreed, so the flag ends up
 				 * zero or non-zero depending on the value of
 				 * bp_protover -- confirm this is intended.
 				 */
 				bp->bp_agreed = bp->bp_protover;
 			} else if (bp->bp_state == BSTP_IFSTATE_DISCARDING)
 				bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING);
 		}
 
 		/* fall back to discarding; edge ports are exempt */
 		if (((bp->bp_sync && !bp->bp_synced) ||
 		    (bp->bp_reroot && bp->bp_recent_root_timer.active) ||
 		    (bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge &&
 		    bp->bp_state != BSTP_IFSTATE_DISCARDING) {
 			bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
 			bp->bp_flags &= ~BSTP_PORT_DISPUTED;
 			bstp_timer_start(&bp->bp_forward_delay_timer,
 			    bp->bp_protover == BSTP_PROTO_RSTP ?
 			    bp->bp_desg_htime : bp->bp_desg_fdelay);
 			DPRINTF("%s -> DESIGNATED_DISCARD\n",
 			    bp->bp_ifp->if_xname);
 		}
 		break;
 	}
 
 	if (bp->bp_flags & BSTP_PORT_NEWINFO)
 		bstp_transmit(bs, bp);
 }
 
 /*
  * Run the topology change logic for one port.
  *
  * Depending on the port's current bp_tcstate, its role and state and the
  * received-TC flags (bp_rcvdtc, bp_rcvdtcn, bp_rcvdtca, bp_tc_prop), zero
  * or more transitions are requested through bstp_set_port_tc().  The checks
  * within a case are independent "if"s evaluated in order, so more than one
  * transition may be requested in a single call.
  */
 static void
 bstp_update_tc(struct bstp_port *bp)
 {
 	switch (bp->bp_tcstate) {
 		case BSTP_TCSTATE_ACTIVE:
 			/* neither designated nor root, or an edge port */
 			if ((bp->bp_role != BSTP_ROLE_DESIGNATED &&
 			    bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
 
 			if (bp->bp_rcvdtcn)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_TCN);
 			if (bp->bp_rcvdtc)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_TC);
 
 			if (bp->bp_tc_prop && !bp->bp_operedge)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG);
 
 			if (bp->bp_rcvdtca)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_ACK);
 			break;
 
 		case BSTP_TCSTATE_INACTIVE:
 			if ((bp->bp_state == BSTP_IFSTATE_LEARNING ||
 			    bp->bp_state == BSTP_IFSTATE_FORWARDING) &&
 			    bp->bp_fdbflush == 0)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
 			break;
 
 		case BSTP_TCSTATE_LEARNING:
 			/* re-enter LEARNING while any TC input is pending */
 			if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca ||
 			    bp->bp_tc_prop)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
 			else if (bp->bp_role != BSTP_ROLE_DESIGNATED &&
 				 bp->bp_role != BSTP_ROLE_ROOT &&
 				 bp->bp_state == BSTP_IFSTATE_DISCARDING)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE);
 
 			/* a forwarding, non-edge designated/root port */
 			if ((bp->bp_role == BSTP_ROLE_DESIGNATED ||
 			    bp->bp_role == BSTP_ROLE_ROOT) &&
 			    bp->bp_state == BSTP_IFSTATE_FORWARDING &&
 			    !bp->bp_operedge)
 				bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED);
 			break;
 
 		/* these are transient states and go straight back to ACTIVE */
 		case BSTP_TCSTATE_DETECTED:
 		case BSTP_TCSTATE_TCN:
 		case BSTP_TCSTATE_TC:
 		case BSTP_TCSTATE_PROPAG:
 		case BSTP_TCSTATE_ACK:
 			DPRINTF("Invalid TC state for %s\n",
 			    bp->bp_ifp->if_xname);
 			break;
 	}
 
 }
 
 /*
  * Make the port advertise the designated information as its own:
  * reset the proposal flags, re-evaluate agreed/synced, copy the designated
  * priority vector and timers to the port and flag new info for transmit.
  */
 static void
 bstp_update_info(struct bstp_port *bp)
 {
 	bp->bp_proposing = 0;
 	bp->bp_proposed = 0;
 
 	/* an agreement only survives if the PDU is still better or the same */
 	if (bp->bp_agreed) {
 		if (!bstp_pdu_bettersame(bp, BSTP_INFO_MINE))
 			bp->bp_agreed = 0;
 	}
 
 	/* without an agreement the port (and the bridge) is out of sync */
 	if (!bp->bp_agreed && bp->bp_synced) {
 		bp->bp_synced = 0;
 		bp->bp_bs->bs_allsynced = 0;
 	}
 
 	/* the designated values become the port values */
 	bp->bp_port_pv = bp->bp_desg_pv;
 	bp->bp_port_msg_age = bp->bp_desg_msg_age;
 	bp->bp_port_max_age = bp->bp_desg_max_age;
 	bp->bp_port_fdelay = bp->bp_desg_fdelay;
 	bp->bp_port_htime = bp->bp_desg_htime;
 	bp->bp_infois = BSTP_INFO_MINE;
 
 	/* request a transmit, the actual send happens later */
 	bp->bp_flags |= BSTP_PORT_NEWINFO;
 }
 
 /*
  * Set bp_tc_prop on every port of the bridge except the one passed in.
  * The bstp lock must be held.
  */
 static void
 bstp_set_other_tcprop(struct bstp_port *bp)
 {
 	struct bstp_state *bs = bp->bp_bs;
 	struct bstp_port *peer;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	LIST_FOREACH(peer, &bs->bs_bplist, bp_next) {
 		if (peer != bp)
 			peer->bp_tc_prop = 1;
 	}
 }
 
 static void
 bstp_set_all_reroot(struct bstp_state *bs)
 {
 	struct bstp_port *bp;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
 		bp->bp_reroot = 1;
 }
 
 /*
  * Request a sync on every port of the bridge and mark the bridge as not
  * fully synced.  The bstp lock must be held.
  */
 static void
 bstp_set_all_sync(struct bstp_state *bs)
 {
 	struct bstp_port *port;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	bs->bs_allsynced = 0;
 
 	LIST_FOREACH(port, &bs->bs_bplist, bp_next) {
 		port->bp_synced = 0;	/* Not explicit in spec */
 		port->bp_sync = 1;
 	}
 }
 
 /*
  * Move a port to a new forwarding state (discarding/learning/forwarding).
  * Nothing happens when the port is already in the requested state.
  * Otherwise the per-state entry action is run and the state task is
  * queued so the parent bridge gets notified.
  */
 static void
 bstp_set_port_state(struct bstp_port *bp, int state)
 {
 	if (bp->bp_state == state)
 		return;
 
 	bp->bp_state = state;
 
 	switch (bp->bp_state) {
 		case BSTP_IFSTATE_DISCARDING:
 			DPRINTF("state changed to DISCARDING on %s\n",
 			    bp->bp_ifp->if_xname);
 			break;
 
 		case BSTP_IFSTATE_LEARNING:
 			DPRINTF("state changed to LEARNING on %s\n",
 			    bp->bp_ifp->if_xname);
 
 			/* RSTP waits one hello time, legacy STP a forward delay */
 			bstp_timer_start(&bp->bp_forward_delay_timer,
 			    bp->bp_protover == BSTP_PROTO_RSTP ?
 			    bp->bp_desg_htime : bp->bp_desg_fdelay);
 			break;
 
 		case BSTP_IFSTATE_FORWARDING:
 			DPRINTF("state changed to FORWARDING on %s\n",
 			    bp->bp_ifp->if_xname);
 
 			bstp_timer_stop(&bp->bp_forward_delay_timer);
 			/* Record that we enabled forwarding */
 			bp->bp_forward_transitions++;
 			break;
 	}
 
 	/* notify the parent bridge */
 	taskqueue_enqueue(taskqueue_swi, &bp->bp_statetask);
 }
 
 /*
  * Change the role of a port.  Nothing happens when the role is unchanged.
  * Otherwise the exit actions of the old role run first, the role is
  * switched, the entry actions of the new role run, and finally the TC
  * state machine is stepped via bstp_update_tc().
  */
 static void
 bstp_set_port_role(struct bstp_port *bp, int role)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	if (bp->bp_role == role)
 		return;
 
 	/* perform pre-change tasks */
 	switch (bp->bp_role) {
 		case BSTP_ROLE_DISABLED:
 			bstp_timer_start(&bp->bp_forward_delay_timer,
 			    bp->bp_desg_max_age);
 			break;
 
 		case BSTP_ROLE_BACKUP:
 			bstp_timer_start(&bp->bp_recent_backup_timer,
 			    bp->bp_desg_htime * 2);
 			/* fall through */
 		case BSTP_ROLE_ALTERNATE:
 			bstp_timer_start(&bp->bp_forward_delay_timer,
 			    bp->bp_desg_fdelay);
 			bp->bp_sync = 0;
 			bp->bp_synced = 1;
 			bp->bp_reroot = 0;
 			break;
 
 		case BSTP_ROLE_ROOT:
 			bstp_timer_start(&bp->bp_recent_root_timer,
 			    BSTP_DEFAULT_FORWARD_DELAY);
 			break;
 	}
 
 	bp->bp_role = role;
 	/* clear values not carried between roles */
 	bp->bp_proposing = 0;
 	bs->bs_allsynced = 0;
 
 	/* initialise the new role */
 	switch (bp->bp_role) {
 		case BSTP_ROLE_DISABLED:
 		case BSTP_ROLE_ALTERNATE:
 		case BSTP_ROLE_BACKUP:
 			DPRINTF("%s role -> ALT/BACK/DISABLED\n",
 			    bp->bp_ifp->if_xname);
 			bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
 			bstp_timer_stop(&bp->bp_recent_root_timer);
 			/* latched timers are not decremented by bstp_timer_dectest */
 			bstp_timer_latch(&bp->bp_forward_delay_timer);
 			bp->bp_sync = 0;
 			bp->bp_synced = 1;
 			bp->bp_reroot = 0;
 			break;
 
 		case BSTP_ROLE_ROOT:
 			DPRINTF("%s role -> ROOT\n",
 			    bp->bp_ifp->if_xname);
 			bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
 			bstp_timer_latch(&bp->bp_recent_root_timer);
 			bp->bp_proposing = 0;
 			break;
 
 		case BSTP_ROLE_DESIGNATED:
 			DPRINTF("%s role -> DESIGNATED\n",
 			    bp->bp_ifp->if_xname);
 			bstp_timer_start(&bp->bp_hello_timer,
 			    bp->bp_desg_htime);
 			bp->bp_agree = 0;
 			break;
 	}
 
 	/* let the TC state know that the role changed */
 	bstp_update_tc(bp);
 }
 
 /*
  * Set the protocol version (STP or RSTP) a port operates in.
  * An unsupported version is logged and otherwise ignored; the port keeps
  * its current version.  On success the CANMIGRATE flag is cleared.
  */
 static void
 bstp_set_port_proto(struct bstp_port *bp, int proto)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	/* supported protocol versions */
 	switch (proto) {
 		case BSTP_PROTO_STP:
 			/* we can downgrade protocols only */
 			bstp_timer_stop(&bp->bp_migrate_delay_timer);
 			/* clear unsupported features */
 			bp->bp_operedge = 0;
 			/* STP compat mode only uses 16 bits of the 32 */
 			if (bp->bp_path_cost > 65535)
 				bp->bp_path_cost = 65535;
 			break;
 
 		case BSTP_PROTO_RSTP:
 			/* CANMIGRATE is set again once this timer expires */
 			bstp_timer_start(&bp->bp_migrate_delay_timer,
 			    bs->bs_migration_delay);
 			break;
 
 		default:
 			DPRINTF("Unsupported STP version %d\n", proto);
 			return;
 	}
 
 	bp->bp_protover = proto;
 	bp->bp_flags &= ~BSTP_PORT_CANMIGRATE;
 }
 
 /*
  * Enter a topology change (TC) state on a port and run its entry actions.
  * ACTIVE, INACTIVE and LEARNING are left in place.  DETECTED, TCN, TC,
  * PROPAG and ACK are transient: after their actions have run the port is
  * put straight back into ACTIVE (marked "UCT" below).
  */
 static void
 bstp_set_port_tc(struct bstp_port *bp, int state)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	bp->bp_tcstate = state;
 
 	/* initialise the new state */
 	switch (bp->bp_tcstate) {
 		case BSTP_TCSTATE_ACTIVE:
 			DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname);
 			/* nothing to do */
 			break;
 
 		case BSTP_TCSTATE_INACTIVE:
 			bstp_timer_stop(&bp->bp_tc_timer);
 			/* flush routes on the parent bridge */
 			/* bp_fdbflush is cleared again by bstp_notify_rtage() */
 			bp->bp_fdbflush = 1;
 			taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask);
 			bp->bp_tc_ack = 0;
 			DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname);
 			break;
 
 		case BSTP_TCSTATE_LEARNING:
 			/* forget any topology change information seen so far */
 			bp->bp_rcvdtc = 0;
 			bp->bp_rcvdtcn = 0;
 			bp->bp_rcvdtca = 0;
 			bp->bp_tc_prop = 0;
 			DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname);
 			break;
 
 		case BSTP_TCSTATE_DETECTED:
 			bstp_set_timer_tc(bp);
 			bstp_set_other_tcprop(bp);
 			/* send out notification */
 			bp->bp_flags |= BSTP_PORT_NEWINFO;
 			bstp_transmit(bs, bp);
 			/* remember when the last topology change happened */
 			getmicrotime(&bs->bs_last_tc_time);
 			DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname);
 			bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
 			break;
 
 		case BSTP_TCSTATE_TCN:
 			bstp_set_timer_tc(bp);
 			DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname);
 			/* fall through */
 		case BSTP_TCSTATE_TC:
 			bp->bp_rcvdtc = 0;
 			bp->bp_rcvdtcn = 0;
 			/* only a designated port acknowledges the change */
 			if (bp->bp_role == BSTP_ROLE_DESIGNATED)
 				bp->bp_tc_ack = 1;
 
 			bstp_set_other_tcprop(bp);
 			DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname);
 			bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
 			break;
 
 		case BSTP_TCSTATE_PROPAG:
 			/* flush routes on the parent bridge */
 			bp->bp_fdbflush = 1;
 			taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask);
 			bp->bp_tc_prop = 0;
 			bstp_set_timer_tc(bp);
 			DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname);
 			bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
 			break;
 
 		case BSTP_TCSTATE_ACK:
 			bstp_timer_stop(&bp->bp_tc_timer);
 			bp->bp_rcvdtca = 0;
 			DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname);
 			bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
 			break;
 	}
 }
 
 /*
  * Start the topology change timer of a port unless it is already running.
  * The duration depends on the protocol version; in RSTP mode new info is
  * also flagged for transmit.
  */
 static void
 bstp_set_timer_tc(struct bstp_port *bp)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	/* never restart a running timer */
 	if (bp->bp_tc_timer.active)
 		return;
 
 	if (bp->bp_protover == BSTP_PROTO_RSTP) {
 		bstp_timer_start(&bp->bp_tc_timer,
 		    bp->bp_desg_htime + BSTP_TICK_VAL);
 		bp->bp_flags |= BSTP_PORT_NEWINFO;
 	} else if (bp->bp_protover == BSTP_PROTO_STP) {
 		bstp_timer_start(&bp->bp_tc_timer,
 		    bs->bs_root_max_age + bs->bs_root_fdelay);
 	}
 }
 
 /*
  * Start the message age timer of a port.  If incrementing the message age
  * would exceed the maximum age the timer is set to expire immediately,
  * otherwise it runs for three hello times.
  */
 static void
 bstp_set_timer_msgage(struct bstp_port *bp)
 {
 	if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR >
 	    bp->bp_port_max_age) {
 		/* expires immediately */
 		bstp_timer_start(&bp->bp_message_age_timer, 0);
 		return;
 	}
 
 	bstp_timer_start(&bp->bp_message_age_timer, bp->bp_port_htime * 3);
 }
 
 /*
  * Return 1 when no port other than bp has an active recent root timer,
  * 0 as soon as one such port is found.
  */
 static int
 bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp)
 {
 	struct bstp_port *other;
 
 	LIST_FOREACH(other, &bs->bs_bplist, bp_next) {
 		if (other != bp && other->bp_recent_root_timer.active)
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * Set the bridge hello time, given in seconds.
  * Returns EPERM unless the bridge runs legacy STP, EINVAL when the value
  * is out of range, 0 on success.
  */
 int
 bstp_set_htime(struct bstp_state *bs, int t)
 {
 	/* the state machines work in ticks, not seconds */
 	int ticks = t * BSTP_TICK_VAL;
 
 	/* value can only be changed in legacy stp mode */
 	if (bs->bs_protover != BSTP_PROTO_STP)
 		return (EPERM);
 
 	if (ticks < BSTP_MIN_HELLO_TIME || ticks > BSTP_MAX_HELLO_TIME)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	bs->bs_bridge_htime = ticks;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the bridge forward delay, given in seconds.
  * Returns EINVAL when the value is out of range, 0 on success.
  */
 int
 bstp_set_fdelay(struct bstp_state *bs, int t)
 {
 	/* the state machines work in ticks, not seconds */
 	int ticks = t * BSTP_TICK_VAL;
 
 	if (ticks < BSTP_MIN_FORWARD_DELAY || ticks > BSTP_MAX_FORWARD_DELAY)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	bs->bs_bridge_fdelay = ticks;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the bridge maximum age, given in seconds.
  * Returns EINVAL when the value is out of range, 0 on success.
  */
 int
 bstp_set_maxage(struct bstp_state *bs, int t)
 {
 	/* the state machines work in ticks, not seconds */
 	int ticks = t * BSTP_TICK_VAL;
 
 	if (ticks < BSTP_MIN_MAX_AGE || ticks > BSTP_MAX_MAX_AGE)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	bs->bs_bridge_max_age = ticks;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 int
 bstp_set_holdcount(struct bstp_state *bs, int count)
 {
 	struct bstp_port *bp;
 
 	if (count < BSTP_MIN_HOLD_COUNT ||
 	    count > BSTP_MAX_HOLD_COUNT)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	bs->bs_txholdcount = count;
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
 		bp->bp_txcount = 0;
 	BSTP_UNLOCK(bs);
 	return (0);
 }
 
 /*
  * Switch the whole bridge to another protocol version.  The hello time is
  * reset to its default and every port is put back into its initial
  * (disabled, TC inactive) state before the bridge is reinitialised.
  * Returns EINVAL for an unsupported version, 0 on success.
  */
 int
 bstp_set_protocol(struct bstp_state *bs, int proto)
 {
 	struct bstp_port *port;
 
 	/* Supported protocol versions */
 	if (proto != BSTP_PROTO_STP && proto != BSTP_PROTO_RSTP)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	bs->bs_protover = proto;
 	bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME;
 
 	/* reinit state of each port */
 	LIST_FOREACH(port, &bs->bs_bplist, bp_next) {
 		port->bp_infois = BSTP_INFO_DISABLED;
 		port->bp_txcount = 0;
 		bstp_set_port_proto(port, bs->bs_protover);
 		bstp_set_port_role(port, BSTP_ROLE_DISABLED);
 		bstp_set_port_tc(port, BSTP_TCSTATE_INACTIVE);
 		bstp_timer_stop(&port->bp_recent_backup_timer);
 	}
 
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the bridge priority.  The value is rounded down to a multiple of
  * 4096.  Returns EINVAL when the value is out of range, 0 on success.
  */
 int
 bstp_set_priority(struct bstp_state *bs, int pri)
 {
 	if (pri > BSTP_MAX_PRIORITY || pri < 0)
 		return (EINVAL);
 
 	/* Limit to steps of 4096 */
 	pri = (pri / 4096) * 4096;
 
 	BSTP_LOCK(bs);
 	bs->bs_bridge_priority = pri;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the priority of a port.  The value is rounded down to a multiple of
  * 16.  Returns EINVAL when the value is out of range, 0 on success.
  */
 int
 bstp_set_port_priority(struct bstp_port *bp, int pri)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	if (pri > BSTP_MAX_PORT_PRIORITY || pri < 0)
 		return (EINVAL);
 
 	/* Limit to steps of 16 */
 	pri = (pri / 16) * 16;
 
 	BSTP_LOCK(bs);
 	bp->bp_priority = pri;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the path cost of a port.  A cost of zero selects the automatically
  * calculated value, any other value is recorded as administratively set.
  * Returns EINVAL when the cost exceeds the maximum, 0 on success.
  */
 int
 bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	if (path_cost > BSTP_MAX_PATH_COST)
 		return (EINVAL);
 
 	/* STP compat mode only uses 16 bits of the 32 */
 	if (path_cost > 65535 && bp->bp_protover == BSTP_PROTO_STP)
 		path_cost = 65535;
 
 	BSTP_LOCK(bs);
 
 	if (path_cost != 0) {
 		bp->bp_flags |= BSTP_PORT_ADMCOST;
 		bp->bp_path_cost = path_cost;
 	} else {
 		/* use auto; the flag must be cleared before recalculating */
 		bp->bp_flags &= ~BSTP_PORT_ADMCOST;
 		bp->bp_path_cost = bstp_calc_path_cost(bp);
 	}
 
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set or clear the edge status of a port, both the operational value and
  * the administrative flag.  Always returns 0.
  */
 int
 bstp_set_edge(struct bstp_port *bp, int set)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	BSTP_LOCK(bs);
 	bp->bp_operedge = set;
 	/* test the stored value, not the argument */
 	if (bp->bp_operedge != 0)
 		bp->bp_flags |= BSTP_PORT_ADMEDGE;
 	else
 		bp->bp_flags &= ~BSTP_PORT_ADMEDGE;
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Enable or disable automatic edge detection on a port.
  * Always returns 0.
  */
 int
 bstp_set_autoedge(struct bstp_port *bp, int set)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	BSTP_LOCK(bs);
 	if (!set) {
 		bp->bp_flags &= ~BSTP_PORT_AUTOEDGE;
 	} else {
 		bp->bp_flags |= BSTP_PORT_AUTOEDGE;
 		/* we may be able to transition straight to edge */
 		if (!bp->bp_edge_delay_timer.active)
 			bstp_edge_delay_expiry(bs, bp);
 	}
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Set the point-to-point link status of a port.  Always returns 0.
  */
 int
 bstp_set_ptp(struct bstp_port *bp, int set)
 {
 	struct bstp_state *state = bp->bp_bs;
 
 	BSTP_LOCK(state);
 	bp->bp_ptp_link = set;
 	BSTP_UNLOCK(state);
 
 	return (0);
 }
 
 /*
  * Enable or disable automatic point-to-point detection on a port.
  * When enabling on a port that is not disabled, the media task is queued
  * so the link is examined again.  Always returns 0.
  */
 int
 bstp_set_autoptp(struct bstp_port *bp, int set)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	BSTP_LOCK(bs);
 	if (!set) {
 		bp->bp_flags &= ~BSTP_PORT_AUTOPTP;
 	} else {
 		bp->bp_flags |= BSTP_PORT_AUTOPTP;
 		if (bp->bp_role != BSTP_ROLE_DISABLED)
 			taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask);
 	}
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Calculate the path cost according to the link speed.
  */
 static uint32_t
 bstp_calc_path_cost(struct bstp_port *bp)
 {
 	struct ifnet *ifp = bp->bp_ifp;
 	uint32_t path_cost;
 
 	/* If the priority has been manually set then retain the value */
 	if (bp->bp_flags & BSTP_PORT_ADMCOST)
 		return bp->bp_path_cost;
 
 	if (ifp->if_link_state == LINK_STATE_DOWN) {
 		/* Recalc when the link comes up again */
 		bp->bp_flags |= BSTP_PORT_PNDCOST;
 		return (BSTP_DEFAULT_PATH_COST);
 	}
 
 	if (ifp->if_baudrate < 1000)
 		return (BSTP_DEFAULT_PATH_COST);
 
  	/* formula from section 17.14, IEEE Std 802.1D-2004 */
 	path_cost = 20000000000ULL / (ifp->if_baudrate / 1000);
 
 	if (path_cost > BSTP_MAX_PATH_COST)
 		path_cost = BSTP_MAX_PATH_COST;
 
 	/* STP compat mode only uses 16 bits of the 32 */
 	if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535)
 		path_cost = 65535;
 
 	return (path_cost);
 }
 
 /*
  * Taskqueue handler that reports a port state change to the parent
  * bridge through the state callback.  This runs from a taskqueue to
  * avoid a LOR.
  */
 static void
 bstp_notify_state(void *arg, int pending)
 {
 	struct bstp_port *bp = arg;
 	struct bstp_state *bs = bp->bp_bs;
 
 	/* nothing to report for inactive ports or without a callback */
 	if (bp->bp_active != 1 || bs->bs_state_cb == NULL)
 		return;
 
 	(*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state);
 }
 
 /*
  * Taskqueue handler that asks the parent bridge to flush the routes of a
  * port through the rtage callback, then clears the pending flush flag.
  * This runs from a taskqueue to avoid a LOR.
  */
 static void
 bstp_notify_rtage(void *arg, int pending)
 {
 	struct bstp_port *bp = arg;
 	struct bstp_state *bs = bp->bp_bs;
 	int age;
 
 	BSTP_LOCK(bs);
 	if (bp->bp_protover == BSTP_PROTO_STP) {
 		/* convert to seconds */
 		age = bp->bp_desg_fdelay / BSTP_TICK_VAL;
 	} else {
 		/* RSTP (and anything else) uses an age of zero */
 		age = 0;
 	}
 	BSTP_UNLOCK(bs);
 
 	/* the callback is invoked without the bstp lock held */
 	if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL)
 		(*bs->bs_rtage_cb)(bp->bp_ifp, age);
 
 	/* flush is complete */
 	BSTP_LOCK(bs);
 	bp->bp_fdbflush = 0;
 	BSTP_UNLOCK(bs);
 }
 
 /*
  * Handle a link state change on a port: refresh the media status and
  * then step the port state machines.  Inactive ports are ignored.
  */
 void
 bstp_linkstate(struct bstp_port *bp)
 {
 	struct bstp_state *bs;
 
 	if (bp->bp_active == 0)
 		return;
 
 	bs = bp->bp_bs;
 
 	/* takes and releases the bstp lock itself */
 	bstp_ifupdstatus(bp, 0);
 
 	BSTP_LOCK(bs);
 	bstp_update_state(bs, bp);
 	BSTP_UNLOCK(bs);
 }
 
 /*
  * Query the media status of a port's interface and bring the port in
  * line with it: update the point-to-point status and a pending path cost,
  * and enable or disable the port depending on whether the link is active.
  * Roles are reassigned if anything changed.
  *
  * Used as the handler of bp_mediatask and called directly from
  * bstp_linkstate(); the 'pending' argument is not used.
  */
 static void
 bstp_ifupdstatus(void *arg, int pending)
 {
 	struct bstp_port *bp = (struct bstp_port *)arg;
 	struct bstp_state *bs = bp->bp_bs;
 	struct ifnet *ifp = bp->bp_ifp;
 	struct ifmediareq ifmr;
 	int error, changed;
 
 	if (!bp->bp_active)
 		return;
 
 	/* the ioctl is issued before the bstp lock is taken */
 	bzero((char *)&ifmr, sizeof(ifmr));
 	error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr);
 
 	BSTP_LOCK(bs);
 	changed = 0;
 	if ((error == 0) && (ifp->if_flags & IFF_UP)) {
 		if (ifmr.ifm_status & IFM_ACTIVE) {
 			/* A full-duplex link is assumed to be point to point */
 			if (bp->bp_flags & BSTP_PORT_AUTOPTP) {
 				int fdx;
 
 				fdx = ifmr.ifm_active & IFM_FDX ? 1 : 0;
 				if (bp->bp_ptp_link ^ fdx) {
 					bp->bp_ptp_link = fdx;
 					changed = 1;
 				}
 			}
 
 			/* Calc the cost if the link was down previously */
 			if (bp->bp_flags & BSTP_PORT_PNDCOST) {
 				uint32_t cost;
 
 				cost = bstp_calc_path_cost(bp);
 				if (bp->bp_path_cost != cost) {
 					bp->bp_path_cost = cost;
 					changed = 1;
 				}
 				bp->bp_flags &= ~BSTP_PORT_PNDCOST;
 			}
 
 			if (bp->bp_role == BSTP_ROLE_DISABLED) {
 				bstp_enable_port(bs, bp);
 				changed = 1;
 			}
 		} else {
 			/* link is not active: take the port out of service */
 			if (bp->bp_role != BSTP_ROLE_DISABLED) {
 				bstp_disable_port(bs, bp);
 				changed = 1;
 				/* restore the administratively set edge status */
 				if ((bp->bp_flags & BSTP_PORT_ADMEDGE) &&
 				    bp->bp_protover == BSTP_PROTO_RSTP)
 					bp->bp_operedge = 1;
 			}
 		}
 	} else if (bp->bp_infois != BSTP_INFO_DISABLED) {
 		/* the ioctl failed or the interface is not up */
 		bstp_disable_port(bs, bp);
 		changed = 1;
 	}
 	if (changed)
 		bstp_assign_roles(bs);
 	BSTP_UNLOCK(bs);
 }
 
 /*
  * Enable a port by marking its information as aged.  The 'bs' argument is
  * not used.
  */
 static void
 bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp)
 {
 	bp->bp_infois = BSTP_INFO_AGED;
 }
 
 /*
  * Disable a port by marking its information as disabled.  The 'bs'
  * argument is not used.
  */
 static void
 bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp)
 {
 	bp->bp_infois = BSTP_INFO_DISABLED;
 }
 
 /*
  * Periodic callout handler of a bridge.  Polls the link of interfaces
  * without linkstate support, advances all per-port timers, runs the
  * expiry handlers, steps the port state machines and finally rearms
  * itself to run again in hz ticks.  The bstp lock must be held.
  */
 static void
 bstp_tick(void *arg)
 {
 	struct epoch_tracker et;
 	struct bstp_state *bs = arg;
 	struct bstp_port *bp;
 
 	BSTP_LOCK_ASSERT(bs);
 
 	/* the callout is not rearmed once the bridge has been stopped */
 	if (bs->bs_running == 0)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	CURVNET_SET(bs->bs_vnet);
 
 	/* poll link events on interfaces that do not support linkstate */
 	if (bstp_timer_dectest(&bs->bs_link_timer)) {
 		LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 			if (!(bp->bp_ifp->if_capabilities & IFCAP_LINKSTATE))
 				taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask);
 		}
 		bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER);
 	}
 
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 		/* no events need to happen for these */
 		bstp_timer_dectest(&bp->bp_tc_timer);
 		bstp_timer_dectest(&bp->bp_recent_root_timer);
 		bstp_timer_dectest(&bp->bp_forward_delay_timer);
 		bstp_timer_dectest(&bp->bp_recent_backup_timer);
 
 		/* timers with an expiry action */
 		if (bstp_timer_dectest(&bp->bp_hello_timer))
 			bstp_hello_timer_expiry(bs, bp);
 
 		if (bstp_timer_dectest(&bp->bp_message_age_timer))
 			bstp_message_age_expiry(bs, bp);
 
 		if (bstp_timer_dectest(&bp->bp_migrate_delay_timer))
 			bstp_migrate_delay_expiry(bs, bp);
 
 		if (bstp_timer_dectest(&bp->bp_edge_delay_timer))
 			bstp_edge_delay_expiry(bs, bp);
 
 		/* update the various state machines for the port */
 		bstp_update_state(bs, bp);
 
 		/* the transmit counter decays by one per tick */
 		if (bp->bp_txcount > 0)
 			bp->bp_txcount--;
 	}
 
 	CURVNET_RESTORE();
 	NET_EPOCH_EXIT(et);
 
 	callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
 }
 
 /*
  * Start (or restart) a timer with the given value; any latch is removed.
  */
 static void
 bstp_timer_start(struct bstp_timer *t, uint16_t v)
 {
 	t->latched = 0;
 	t->active = 1;
 	t->value = v;
 }
 
 /*
  * Stop a timer: it becomes inactive, unlatched and its value is reset.
  */
 static void
 bstp_timer_stop(struct bstp_timer *t)
 {
 	t->latched = 0;
 	t->active = 0;
 	t->value = 0;
 }
 
 /*
  * Latch a timer: it is marked active, but bstp_timer_dectest() leaves a
  * latched timer alone.
  */
 static void
 bstp_timer_latch(struct bstp_timer *t)
 {
 	t->active = 1;
 	t->latched = 1;
 }
 
 /*
  * Decrement a running timer by one tick.  Returns 1 when the timer expired
  * with this decrement (it is stopped then), 0 otherwise.  Inactive and
  * latched timers are left untouched.
  */
 static int
 bstp_timer_dectest(struct bstp_timer *t)
 {
 	if (!t->active || t->latched)
 		return (0);
 
 	t->value -= BSTP_TICK_VAL;
 	if (t->value > 0)
 		return (0);
 
 	/* expired */
 	bstp_timer_stop(t);
 	return (1);
 }
 
 /*
  * Hello timer expired: restart it and transmit when the port has new
  * info pending, is a designated port, or is a root port with an active
  * TC timer.  Otherwise do nothing.
  */
 static void
 bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp)
 {
 	int root_with_tc;
 
 	root_with_tc = (bp->bp_role == BSTP_ROLE_ROOT &&
 	    bp->bp_tc_timer.active == 1);
 
 	if ((bp->bp_flags & BSTP_PORT_NEWINFO) == 0 &&
 	    bp->bp_role != BSTP_ROLE_DESIGNATED && !root_with_tc)
 		return;
 
 	bp->bp_flags |= BSTP_PORT_NEWINFO;
 	bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime);
 	bstp_transmit(bs, bp);
 }
 
 /*
  * Message age timer expired: received information is marked as aged and
  * the roles are reassigned.  Other kinds of information are not affected.
  */
 static void
 bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp)
 {
 	if (bp->bp_infois != BSTP_INFO_RECEIVED)
 		return;
 
 	bp->bp_infois = BSTP_INFO_AGED;
 	bstp_assign_roles(bs);
 	DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname);
 }
 
 /*
  * Migrate delay timer expired: flag the port as allowed to migrate.
  * The 'bs' argument is not used.
  */
 static void
 bstp_migrate_delay_expiry(struct bstp_state *bs, struct bstp_port *bp)
 {
 	bp->bp_flags |= BSTP_PORT_CANMIGRATE;
 }
 
 /*
  * Edge delay timer expired: turn the port into an edge port if automatic
  * edge detection is enabled and it is a proposing, designated RSTP port.
  */
 static void
 bstp_edge_delay_expiry(struct bstp_state *bs, struct bstp_port *bp)
 {
 	if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) == 0)
 		return;
 	if (bp->bp_protover != BSTP_PROTO_RSTP || !bp->bp_proposing)
 		return;
 	if (bp->bp_role != BSTP_ROLE_DESIGNATED)
 		return;
 
 	bp->bp_operedge = 1;
 	DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname);
 }
 
 /*
  * Compare two Ethernet addresses byte by byte.  Returns the difference of
  * the first pair of bytes that differ (a minus b), or 0 when the
  * addresses are equal.
  */
 static int
 bstp_addr_cmp(const uint8_t *a, const uint8_t *b)
 {
 	int idx;
 
 	for (idx = 0; idx < ETHER_ADDR_LEN; idx++) {
 		int diff = (int)a[idx] - (int)b[idx];
 
 		if (diff != 0)
 			return (diff);
 	}
 
 	return (0);
 }
 
 /*
  * Compare the bridge address component of two bridge ids.
  * Returns 1 when the addresses are identical, 0 otherwise.
  */
 static int
 bstp_same_bridgeid(uint64_t id1, uint64_t id2)
 {
 	u_char first[ETHER_ADDR_LEN];
 	u_char second[ETHER_ADDR_LEN];
 
 	/* extract the address part of each id */
 	PV2ADDR(id1, first);
 	PV2ADDR(id2, second);
 
 	return (bstp_addr_cmp(first, second) == 0);
 }
 
 /*
  * Recompute the bridge identity and restart role assignment.
  *
  * The bridge id is built from the bridge priority and the lowest MAC
  * address among the Ethernet/VLAN interfaces that belong to this bridge.
  * If the bridge has no ports, or no member with a non-zero MAC address,
  * the spanning tree is disabled instead: all remaining ports are set to
  * the disabled role and the tick callout is stopped.
  * The bstp lock must be held.
  */
 void
 bstp_reinit(struct bstp_state *bs)
 {
 	struct epoch_tracker et;
 	struct bstp_port *bp;
 	struct ifnet *ifp, *mif;
 	u_char *e_addr;
 	void *bridgeptr;
 	static const u_char llzero[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	BSTP_LOCK_ASSERT(bs);
 
 	if (LIST_EMPTY(&bs->bs_bplist))
 		goto disablestp;
 
 	mif = NULL;
 	/* the bridge of the first port identifies the members to look at */
 	bridgeptr = LIST_FIRST(&bs->bs_bplist)->bp_ifp->if_bridge;
 	KASSERT(bridgeptr != NULL, ("Invalid bridge pointer"));
 	/*
 	 * Search through the Ethernet adapters and find the one with the
 	 * lowest value. Make sure the adapter which we take the MAC address
 	 * from is part of this bridge, so we can have more than one independent
 	 * bridges in the same STP domain.
 	 */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
 			continue;	/* Not Ethernet */
 
 		if (ifp->if_bridge != bridgeptr)
 			continue;	/* Not part of our bridge */
 
 		if (bstp_addr_cmp(IF_LLADDR(ifp), llzero) == 0)
 			continue;	/* No mac address set */
 
 		if (mif == NULL) {
 			mif = ifp;
 			continue;
 		}
 		if (bstp_addr_cmp(IF_LLADDR(ifp), IF_LLADDR(mif)) < 0) {
 			mif = ifp;
 			continue;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	if (mif == NULL)
 		goto disablestp;
 
 	/* bridge id: priority in the top 16 bits, MAC address below */
 	e_addr = IF_LLADDR(mif);
 	bs->bs_bridge_pv.pv_dbridge_id =
 	    (((uint64_t)bs->bs_bridge_priority) << 48) |
 	    (((uint64_t)e_addr[0]) << 40) |
 	    (((uint64_t)e_addr[1]) << 32) |
 	    (((uint64_t)e_addr[2]) << 24) |
 	    (((uint64_t)e_addr[3]) << 16) |
 	    (((uint64_t)e_addr[4]) << 8) |
 	    (((uint64_t)e_addr[5]));
 
 	/* start out with this bridge as the root */
 	bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id;
 	bs->bs_bridge_pv.pv_cost = 0;
 	bs->bs_bridge_pv.pv_dport_id = 0;
 	bs->bs_bridge_pv.pv_port_id = 0;
 
 	/* make sure the tick callout is scheduled while running */
 	if (bs->bs_running && callout_pending(&bs->bs_bstpcallout) == 0)
 		callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
 
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 		/* port id: priority above the low 12 bits of the if index */
 		bp->bp_port_id = (bp->bp_priority << 8) |
 		    (bp->bp_ifp->if_index  & 0xfff);
 		taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask);
 	}
 
 	bstp_assign_roles(bs);
 	bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER);
 	return;
 
 disablestp:
 	/* Set the bridge and root id (lower bits) to zero */
 	bs->bs_bridge_pv.pv_dbridge_id =
 	    ((uint64_t)bs->bs_bridge_priority) << 48;
 	bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id;
 	bs->bs_root_pv = bs->bs_bridge_pv;
 	/* Disable any remaining ports, they will have no MAC address */
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
 		bp->bp_infois = BSTP_INFO_DISABLED;
 		bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
 	}
 	callout_stop(&bs->bs_bstpcallout);
 }
 
 /*
  * Module event handler.  Loading sets up the global list of bstp
  * instances and its mutex, unloading destroys the mutex.  Any other event
  * is rejected with EOPNOTSUPP.
  */
 static int
 bstp_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	if (type == MOD_LOAD) {
 		mtx_init(&bstp_list_mtx, "bridgestp list", NULL, MTX_DEF);
 		LIST_INIT(&bstp_list);
 	} else if (type == MOD_UNLOAD) {
 		mtx_destroy(&bstp_list_mtx);
 	} else {
 		error = EOPNOTSUPP;
 	}
 
 	return (error);
 }
 
 /*
  * Module descriptor: the module name, its event handler and a zero third
  * field.
  */
 static moduledata_t bstp_mod = {
 	"bridgestp",
 	bstp_modevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(bridgestp, bstp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(bridgestp, 1);
 
 /*
  * Initialise a bstp instance for a bridge: set up its lock, callout and
  * port list, load the default parameters (RSTP mode), record the bridge
  * callbacks and add the instance to the global list.
  */
 void
 bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb)
 {
 	BSTP_LOCK_INIT(bs);
 	callout_init_mtx(&bs->bs_bstpcallout, &bs->bs_mtx, 0);
 	LIST_INIT(&bs->bs_bplist);
 
 	/* protocol version and owning vnet */
 	bs->bs_protover = BSTP_PROTO_RSTP;
 	bs->bs_vnet = curvnet;
 
 	/* callbacks into the parent bridge */
 	bs->bs_state_cb = cb->bcb_state;
 	bs->bs_rtage_cb = cb->bcb_rtage;
 
 	/* default bridge parameters */
 	bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY;
 	bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME;
 	bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE;
 	bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY;
 	bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME;
 	bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT;
 	bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY;
 
 	getmicrotime(&bs->bs_last_tc_time);
 
 	/* make the instance visible on the global list */
 	mtx_lock(&bstp_list_mtx);
 	LIST_INSERT_HEAD(&bstp_list, bs, bs_list);
 	mtx_unlock(&bstp_list_mtx);
 }
 
 /*
  * Tear down a bstp instance.  All ports must have been removed already.
  * The instance is taken off the global list, the tick callout is drained
  * and the lock destroyed.
  */
 void
 bstp_detach(struct bstp_state *bs)
 {
 	KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active"));
 
 	mtx_lock(&bstp_list_mtx);
 	LIST_REMOVE(bs, bs_list);
 	mtx_unlock(&bstp_list_mtx);
 	/* drain the callout before the lock it uses is destroyed */
 	callout_drain(&bs->bs_bstpcallout);
 	BSTP_LOCK_DESTROY(bs);
 }
 
 /*
  * Start the spanning tree on a bridge: schedule the tick callout, mark
  * the instance as running and reinitialise it.
  */
 void
 bstp_init(struct bstp_state *bs)
 {
 	BSTP_LOCK(bs);
 	callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
 	/* bstp_tick() returns early while bs_running is zero */
 	bs->bs_running = 1;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 }
 
 void
 bstp_stop(struct bstp_state *bs)
 {
 	struct bstp_port *bp;
 
 	BSTP_LOCK(bs);
 
 	LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
 		bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
 
 	bs->bs_running = 0;
 	callout_stop(&bs->bs_bstpcallout);
 	BSTP_UNLOCK(bs);
 }
 
 /*
  * Initialise a port structure for an interface of the bridge.  The
  * structure is zeroed, bound to the interface and the bstp instance, its
  * tasks are set up and it is put into its initial state (disabled role,
  * discarding, TC inactive).  The port is not added to the port list here;
  * bstp_enable() does that.  Always returns 0.
  */
 int
 bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp)
 {
 	bzero(bp, sizeof(struct bstp_port));
 
 	BSTP_LOCK(bs);
 	bp->bp_ifp = ifp;
 	bp->bp_bs = bs;
 	bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY;
 	/* deferred work: state notify, route flush, media status update */
 	TASK_INIT(&bp->bp_statetask, 0, bstp_notify_state, bp);
 	TASK_INIT(&bp->bp_rtagetask, 0, bstp_notify_rtage, bp);
 	TASK_INIT(&bp->bp_mediatask, 0, bstp_ifupdstatus, bp);
 
 	/* Init state */
 	bp->bp_infois = BSTP_INFO_DISABLED;
 	bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP;
 	bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
 	bstp_set_port_proto(bp, bs->bs_protover);
 	bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
 	bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE);
 	/* computed last, the result depends on the protocol version set above */
 	bp->bp_path_cost = bstp_calc_path_cost(bp);
 	BSTP_UNLOCK(bs);
 	return (0);
 }
 
 /*
  * Make a port an active member of the spanning tree.  Only Ethernet and
  * L2 VLAN interfaces are accepted, anything else yields EINVAL.
  * Returns 0 on success.  Must be called inside the network epoch.
  */
 int
 bstp_enable(struct bstp_port *bp)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	KASSERT(bp->bp_active == 0, ("already a bstp member"));
 	NET_EPOCH_ASSERT(); /* Because bstp_update_roles() causes traffic. */
 
 	/* These can do spanning tree, nothing else can. */
 	if (bp->bp_ifp->if_type != IFT_ETHER &&
 	    bp->bp_ifp->if_type != IFT_L2VLAN)
 		return (EINVAL);
 
 	BSTP_LOCK(bs);
 	LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next);
 	bp->bp_active = 1;
 	bp->bp_flags |= BSTP_PORT_NEWINFO;
 	bstp_reinit(bs);
 	bstp_update_roles(bs, bp);
 	BSTP_UNLOCK(bs);
 
 	return (0);
 }
 
 /*
  * Remove a port from the spanning tree: its information is marked as
  * disabled, it is taken off the port list and the bridge is
  * reinitialised without it.
  */
 void
 bstp_disable(struct bstp_port *bp)
 {
 	struct bstp_state *bs = bp->bp_bs;
 
 	KASSERT(bp->bp_active == 1, ("not a bstp member"));
 
 	BSTP_LOCK(bs);
 	bstp_disable_port(bs, bp);
 	LIST_REMOVE(bp, bp_next);
 	bp->bp_active = 0;
 	bstp_reinit(bs);
 	BSTP_UNLOCK(bs);
 }
 
 /*
  * The bstp_port structure is about to be freed by the parent bridge.
  * Wait for the port's queued tasks to finish, and reassign roles if the
  * bridge still records this port as its root port.
  */
 void
 bstp_destroy(struct bstp_port *bp)
 {
 	KASSERT(bp->bp_active == 0, ("port is still attached"));
 	/* no task may reference the port once it has been freed */
 	taskqueue_drain(taskqueue_swi, &bp->bp_statetask);
 	taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask);
 	taskqueue_drain(taskqueue_swi, &bp->bp_mediatask);
 
 	/*
 	 * NOTE(review): bstp_assign_roles() is called here without the bstp
 	 * lock held, while the other callers in this file hold it - confirm
 	 * that this is safe.
 	 */
 	if (bp->bp_bs->bs_root_port == bp)
 		bstp_assign_roles(bp->bp_bs);
 }
diff --git a/sys/net/debugnet.c b/sys/net/debugnet.c
index b41d93eca7fe..372801d3d970 100644
--- a/sys/net/debugnet.c
+++ b/sys/net/debugnet.c
@@ -1,1099 +1,1100 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2019 Isilon Systems, LLC.
  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
  * Copyright (c) 2000 Darrell Anderson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 #endif
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <machine/in_cksum.h>
 #include <machine/pcb.h>
 
 #include <net/debugnet.h>
 #define	DEBUGNET_INTERNAL
 #include <net/debugnet_int.h>
 
 FEATURE(debugnet, "Debugnet support");
 
 /* Parent node for the net.debugnet.* tunables declared below. */
 SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "debugnet parameters");
 
 /* net.debugnet.debug: verbosity of debugnet diagnostic messages. */
 unsigned debugnet_debug;
 SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
     &debugnet_debug, 0,
     "Debug message verbosity (0: off; 1: on; 2: verbose)");
 
 /*
  * net.debugnet.npolls: poll iterations debugnet_send() waits for acks
  * before it retransmits.
  */
 int debugnet_npolls = 2000;
 SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
     &debugnet_npolls, 0,
     "Number of times to poll before assuming packet loss (0.5ms per poll)");
 /* net.debugnet.nretries: retransmit rounds before ETIMEDOUT is returned. */
 int debugnet_nretries = 10;
 SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
     &debugnet_nretries, 0,
     "Number of retransmit attempts before giving up");
 /* net.debugnet.fib: FIB used for the route lookup in debugnet_connect(). */
 int debugnet_fib = RT_DEFAULT_FIB;
 SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
     &debugnet_fib, 0,
     "Fib to use when sending dump");
 
 /*
  * There is a single, statically allocated PCB; g_debugnet_pcb_inuse records
  * whether a connection currently owns it.
  */
 static bool g_debugnet_pcb_inuse;
 static struct debugnet_pcb g_dnet_pcb;
 
 /*
  * Simple accessors for opaque PCB.
  */
 const unsigned char *
 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
 {
 	/*
 	 * Only meaningful for the global PCB, while it is in use and once its
 	 * state has reached DN_STATE_HAVE_GW_MAC.
 	 */
 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
 	/* The result points into the PCB; it is not a copy. */
 	return (pcb->dp_gw_mac.octet);
 }
 
 /*
  * Start of network primitives, beginning with output primitives.
  */
 
 /*
  * Prepend an ethernet header to an outgoing packet and hand the result to
  * the NIC's debugnet transmit method.
  *
  * Parameters:
  *	m	The mbuf containing the packet to be sent (will be freed by
  *		this function or the NIC driver)
  *	ifp	The interface to send on
  *	dst	The destination ethernet address (the source address is taken
  *		from ifp)
  *	etype	The ETHERTYPE_* value for the protocol that is being sent
  *
  * Returns:
  *	int	see errno.h, 0 for success
  */
 int
 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
     u_short etype)
 {
 	struct ether_header *hdr;
 	bool up, running;
 
 	/* Need IFF_UP without IFF_MONITOR, and IFF_DRV_RUNNING. */
 	up = (ifp->if_flags & (IFF_MONITOR | IFF_UP)) == IFF_UP;
 	running = (ifp->if_drv_flags & IFF_DRV_RUNNING) == IFF_DRV_RUNNING;
 	if (!up || !running) {
 		if_printf(ifp, "%s: interface isn't up\n", __func__);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Make room for the ethernet header, then fill it in. */
 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 	if (m == NULL) {
 		printf("%s: out of mbufs\n", __func__);
 		return (ENOBUFS);
 	}
 	hdr = mtod(m, struct ether_header *);
 	hdr->ether_type = htons(etype);
 	memcpy(hdr->ether_dhost, dst.octet, ETHER_ADDR_LEN);
 	memcpy(hdr->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 
 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
 }
 
 /*
  * Unreliable transmission of an mbuf chain to the debugnet server: prepend
  * a UDP header and pass the packet on to debugnet_ip_output().
  * Note: can't handle fragmentation; fails if the packet is larger than
  *	 ifp->if_mtu after adding the UDP/IP headers
  *
  * Parameters:
  *	pcb	The debugnet context block
  *	m	mbuf chain
  *
  * Returns:
  *	int	see errno.h, 0 for success
  */
 static int
 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
 {
 	struct udphdr *uh;
 
 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
 
 	M_PREPEND(m, sizeof(*uh), M_NOWAIT);
 	if (m == NULL) {
 		printf("%s: out of mbufs\n", __func__);
 		return (ENOBUFS);
 	}
 	uh = mtod(m, void *);
 
 	/* Use this src port so that the server can connect() the socket */
 	uh->uh_sport = htons(pcb->dp_client_port);
 	uh->uh_dport = htons(pcb->dp_server_port);
 	/* The length covers the UDP header that was just prepended. */
 	uh->uh_ulen = htons(m->m_pkthdr.len);
 	/* Computed later (protocol-dependent). */
 	uh->uh_sum = 0;
 
 	return (debugnet_ip_output(pcb, m));
 }
 
 /*
  * Send an acknowledgement carrying the given sequence number, which must
  * already be in network byte order.  Returns 0 on success, ENOBUFS if no
  * mbuf is available, or the error from debugnet_udp_output().
  */
 int
 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
 {
 	struct debugnet_ack *ack;
 	struct mbuf *pkt;
 
 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
 
 	pkt = m_gethdr(M_NOWAIT, MT_DATA);
 	if (pkt == NULL) {
 		printf("%s: Out of mbufs\n", __func__);
 		return (ENOBUFS);
 	}
 
 	/* The ack structure is the entire payload. */
 	pkt->m_len = sizeof(*ack);
 	pkt->m_pkthdr.len = sizeof(*ack);
 	MH_ALIGN(pkt, sizeof(*ack));
 
 	ack = mtod(pkt, void *);
 	ack->da_seqno = seqno;
 
 	return (debugnet_udp_output(pcb, pkt));
 }
 
 /*
  * Dummy free function for debugnet clusters.
  *
  * debugnet_send() attaches the caller's data buffer to an mbuf with
  * MEXTADD() and names this function as the free routine; it deliberately
  * does nothing with the mbuf it is given.
  */
 static void
 debugnet_mbuf_free(struct mbuf *m __unused)
 {
 }
 
 /*
  * Construct and reliably send a debugnet packet.  May fail from a resource
  * shortage or extreme number of unacknowledged retransmissions.  Wait for
  * an acknowledgement before returning.  Splits packets into chunks small
  * enough to be sent without fragmentation (looks up the interface MTU)
  *
  * Chunk number i of the message is sent with sequence number
  * pcb->dp_seqno + i and is tracked by bit i of want_acks (what was sent) and
  * pcb->dp_rcvd_acks (what has been acknowledged).  On a retransmit round
  * only the chunks whose bit is still clear in dp_rcvd_acks are sent again.
  *
  * Parameters:
  *	pcb	The debugnet context block
  *	type	debugnet packet type (HERALD, FINISHED, ...)
  *	data	data
  *	datalen	data size (bytes)
  *	auxdata	optional auxiliary information
  *
  * Returns:
  *	int see errno.h, 0 for success; ECONNRESET if the remote closed the
  *	connection, ENOBUFS if no mbuf was available, ETIMEDOUT once the
  *	retransmit budget is exhausted, or the error from the output path
  */
 int
 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
 {
 	struct debugnet_msg_hdr *dn_msg_hdr;
 	struct mbuf *m, *m2;
 	uint64_t want_acks;
 	uint32_t i, pktlen, sent_so_far;
 	int retries, polls, error;
 
 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
 		return (ECONNRESET);
 
 	want_acks = 0;
 	pcb->dp_rcvd_acks = 0;
 	retries = 0;
 
 retransmit:
 	/*
 	 * Chunks can be too big to fit in packets.  A zero-length message
 	 * still produces exactly one (header-only) packet.
 	 */
 	for (i = sent_so_far = 0; sent_so_far < datalen ||
 	    (i == 0 && datalen == 0); i++) {
 		pktlen = datalen - sent_so_far;
 
 		/* Bound: the interface MTU (assume no IP options). */
 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
 
 		/*
 		 * Check if it is retransmitting and this has been ACKed
 		 * already.  The bit is formed in a 64-bit type: a plain
 		 * "1 << i" is an int shift and is undefined once i reaches
 		 * the width of int, well below DEBUGNET_MAX_IN_FLIGHT.
 		 */
 		if ((pcb->dp_rcvd_acks & ((uint64_t)1 << i)) != 0) {
 			sent_so_far += pktlen;
 			continue;
 		}
 
 		/*
 		 * Get and fill a header mbuf, then chain data as an extended
 		 * mbuf.
 		 */
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			printf("%s: Out of mbufs\n", __func__);
 			return (ENOBUFS);
 		}
 		m->m_len = sizeof(struct debugnet_msg_hdr);
 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
 		dn_msg_hdr->mh_type = htonl(type);
 		dn_msg_hdr->mh_len = htonl(pktlen);
 
 		if (auxdata != NULL) {
 			dn_msg_hdr->mh_offset =
 			    htobe64(auxdata->dp_offset_start + sent_so_far);
 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
 		} else {
 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
 			dn_msg_hdr->mh_aux2 = 0;
 		}
 
 		if (pktlen != 0) {
 			m2 = m_get(M_NOWAIT, MT_DATA);
 			if (m2 == NULL) {
 				m_freem(m);
 				printf("%s: Out of mbufs\n", __func__);
 				return (ENOBUFS);
 			}
 			/*
 			 * Reference the caller's buffer in place instead of
 			 * copying it; debugnet_mbuf_free() is a no-op, so the
 			 * buffer stays owned by the caller.
 			 */
 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
 			    EXT_DISPOSABLE);
 			m2->m_len = pktlen;
 
 			m_cat(m, m2);
 			m->m_pkthdr.len += pktlen;
 		}
 		error = debugnet_udp_output(pcb, m);
 		if (error != 0)
 			return (error);
 
 		/* Note that we're waiting for this packet in the bitfield. */
 		want_acks |= ((uint64_t)1 << i);
 		sent_so_far += pktlen;
 	}
 	/*
 	 * NOTE(review): this only warns after the fact; a message that needs
 	 * more chunks than want_acks has bits has already shifted out of
 	 * range above.
 	 */
 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
 		printf("Warning: Sent more than %d packets (%u). "
 		    "Acknowledgements will fail unless the size of "
 		    "rcvd_acks/want_acks is increased.\n",
 		    DEBUGNET_MAX_IN_FLIGHT, i);
 
 	/*
 	 * Wait for acks.  A *real* window would speed things up considerably.
 	 */
 	polls = 0;
 	while (pcb->dp_rcvd_acks != want_acks) {
 		if (polls++ > debugnet_npolls) {
 			if (retries++ > debugnet_nretries)
 				return (ETIMEDOUT);
 			printf(". ");
 			goto retransmit;
 		}
 		debugnet_network_poll(pcb);
 		DELAY(500);
 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
 			return (ECONNRESET);
 	}
 	/* Everything was acked; consume the sequence numbers that were used. */
 	pcb->dp_seqno += i;
 	return (0);
 }
 
 /*
  * Network input primitives.
  */
 
 /*
  * Just introspect the header enough to fire off a seqno ack and validate
  * length fits.
  *
  * The message is acked first and only then passed to pcb->dp_rx_handler.
  * Nothing is passed on if the packet is shorter than a message header, if
  * the advertised payload length does not fit in the packet, or if the ack
  * could not be sent.  *mb is updated when m_pullup() replaces the mbuf and
  * is NULL if the pullup failed.
  */
 static void
 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
 {
 	const struct debugnet_msg_hdr *dnh;
 	struct mbuf *m;
 	int error;
 
 	m = *mb;
 
 	if (m->m_pkthdr.len < sizeof(*dnh)) {
 		DNETDEBUG("ignoring small debugnet_msg packet\n");
 		return;
 	}
 
 	/* Get ND header. */
 	if (m->m_len < sizeof(*dnh)) {
 		m = m_pullup(m, sizeof(*dnh));
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("m_pullup failed\n");
 			return;
 		}
 	}
 	dnh = mtod(m, const void *);
 
 	/*
 	 * Compare the untrusted length against the room left after the
 	 * header instead of adding the header size to it: where size_t is 32
 	 * bits the sum could wrap and let an oversized mh_len through.  The
 	 * subtraction cannot underflow because the packet was verified above
 	 * to hold at least a full header.
 	 */
 	if (ntohl(dnh->mh_len) > m->m_pkthdr.len - sizeof(*dnh)) {
 		DNETDEBUG("Dropping short packet.\n");
 		return;
 	}
 
 	/*
 	 * If the issue is transient (ENOBUFS), sender should resend.  If
 	 * non-transient (like driver objecting to rx -> tx from the same
 	 * thread), not much else we can do.
 	 */
 	error = debugnet_ack_output(pcb, dnh->mh_seqno);
 	if (error != 0)
 		return;
 
 	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
 		printf("Remote shut down the connection on us!\n");
 		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
 
 		/*
 		 * Continue through to the user handler so they are signalled
 		 * not to wait for further rx.
 		 */
 	}
 
 	pcb->dp_rx_handler(pcb, mb);
 }
 
 /*
  * Process an incoming acknowledgement.
  *
  * The first ack seen before the PCB reaches DN_STATE_GOT_HERALD_PORT also
  * fixes the server port to the ack's source port.  An ack whose sequence
  * number lies within DEBUGNET_MAX_IN_FLIGHT of pcb->dp_seqno sets the
  * matching bit in pcb->dp_rcvd_acks, which debugnet_send() polls; older acks
  * are ignored silently and acks beyond that window are reported.
  *
  * Parameters:
  *	pcb	The debugnet context block
  *	mb	The packet, positioned at the ack payload; updated if
  *		m_pullup() replaces the mbuf (NULL when the pullup fails)
  *	sport	UDP source port of the packet, host byte order
  */
 static void
 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
 {
 	const struct debugnet_ack *dn_ack;
 	struct mbuf *m;
 	uint32_t rcv_ackno;
 
 	m = *mb;
 
 	/* Get Ack. */
 	if (m->m_len < sizeof(*dn_ack)) {
 		m = m_pullup(m, sizeof(*dn_ack));
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("m_pullup failed\n");
 			return;
 		}
 	}
 	dn_ack = mtod(m, const void *);
 
 	/* Debugnet processing. */
 	/*
 	 * Packet is meant for us.  Extract the ack sequence number and the
 	 * port number if necessary.
 	 */
 	rcv_ackno = ntohl(dn_ack->da_seqno);
 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
 		pcb->dp_server_port = sport;
 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
 	}
 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
 	else if (rcv_ackno >= pcb->dp_seqno) {
 		/*
 		 * We're interested in this ack. Record it.  The offset can be
 		 * anything below DEBUGNET_MAX_IN_FLIGHT, so the bit is formed
 		 * in a 64-bit type; "1 << n" is an int shift and is undefined
 		 * once n reaches the width of int.
 		 */
 		pcb->dp_rcvd_acks |=
 		    (uint64_t)1 << (rcv_ackno - pcb->dp_seqno);
 	}
 }
 
 /*
  * Validate the UDP header of an incoming packet and dispatch its payload.
  *
  * Packets that are not addressed to the configured client port, or whose
  * UDP length field is inconsistent with the packet, are ignored.  A payload
  * exactly the size of struct debugnet_ack is treated as an ack; anything
  * else goes to debugnet_handle_rx_msg() when the PCB has an rx handler and
  * is dropped otherwise.
  *
  * Parameters:
  *	pcb	The debugnet context block
  *	mb	The packet, positioned at the UDP header; updated if
  *		m_pullup() replaces the mbuf (NULL when the pullup fails)
  */
 void
 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
 {
 	const struct udphdr *udp;
 	struct mbuf *m;
 	uint16_t sport, ulen;
 
 	/* UDP processing. */
 
 	m = *mb;
 	if (m->m_pkthdr.len < sizeof(*udp)) {
 		DNETDEBUG("ignoring small UDP packet\n");
 		return;
 	}
 
 	/* Get UDP headers. */
 	if (m->m_len < sizeof(*udp)) {
 		m = m_pullup(m, sizeof(*udp));
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("m_pullup failed\n");
 			return;
 		}
 	}
 	udp = mtod(m, const void *);
 
 	/* We expect to receive UDP packets on the configured client port. */
 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
 		DNETDEBUG("not on the expected port.\n");
 		return;
 	}
 
 	/* Check that ulen does not exceed actual size of data. */
 	ulen = ntohs(udp->uh_ulen);
 	if (m->m_pkthdr.len < ulen) {
 		DNETDEBUG("ignoring runt UDP packet\n");
 		return;
 	}
 
 	/*
 	 * The length field counts the UDP header itself.  A smaller value is
 	 * malformed and would make the 16-bit subtraction below wrap around
 	 * to a huge payload length.
 	 */
 	if (ulen < sizeof(*udp)) {
 		DNETDEBUG("ignoring UDP packet with bogus length\n");
 		return;
 	}
 
 	/* Read the port before m_adj() strips the header udp points at. */
 	sport = ntohs(udp->uh_sport);
 
 	m_adj(m, sizeof(*udp));
 	ulen -= sizeof(*udp);
 
 	if (ulen == sizeof(struct debugnet_ack)) {
 		debugnet_handle_ack(pcb, mb, sport);
 		return;
 	}
 
 	if (pcb->dp_rx_handler == NULL) {
 		if (ulen < sizeof(struct debugnet_ack))
 			DNETDEBUG("ignoring small ACK packet\n");
 		else
 			DNETDEBUG("ignoring unexpected non-ACK packet on "
 			    "half-duplex connection.\n");
 		return;
 	}
 
 	debugnet_handle_rx_msg(pcb, mb);
 }
 
 /*
  * Handler for incoming packets directly from the network adapter
  * Identifies the packet type (IP or ARP) and passes it along to one of the
  * helper functions debugnet_handle_ip or debugnet_handle_arp.
  *
  * It needs to partially replicate the behaviour of ether_input() and
  * ether_demux().
  *
  * Parameters:
  *	ifp	the interface the packet came from
  *	m	an mbuf containing the packet received
  */
 static void
 debugnet_input_one(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifreq ifr;
 	struct ether_header *eh;
 	u_short etype;
 
 	/* Ethernet processing. */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
 		goto done;
 	}
 	/* The whole ethernet header must sit in the first mbuf. */
 	if (m->m_len < ETHER_HDR_LEN) {
 		DNETDEBUG_IF(ifp,
 	    "discard frame without leading eth header (len %u pktlen %u)\n",
 		    m->m_len, m->m_pkthdr.len);
 		goto done;
 	}
 	/* Trim ETHER_CRC_LEN bytes if the frame is flagged as carrying an FCS. */
 	if ((m->m_flags & M_HASFCS) != 0) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
 		goto done;
 	}
 	if (if_gethwaddr(ifp, &ifr) != 0) {
 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
 		goto done;
 	}
 	/*
 	 * Accept frames addressed to the interface's own hardware address;
 	 * the only other frames accepted are broadcast ARP.
 	 */
 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
 	    ETHER_ADDR_LEN) != 0 &&
 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
 		DNETDEBUG_IF(ifp,
 		    "discard frame with incorrect destination addr\n");
 		goto done;
 	}
 
 	MPASS(g_debugnet_pcb_inuse);
 
 	/* Done ethernet processing. Strip off the ethernet header. */
 	m_adj(m, ETHER_HDR_LEN);
 	switch (etype) {
 	case ETHERTYPE_ARP:
 		debugnet_handle_arp(&g_dnet_pcb, &m);
 		break;
 	case ETHERTYPE_IP:
 		debugnet_handle_ip(&g_dnet_pcb, &m);
 		break;
 	default:
 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
 		break;
 	}
 done:
 	/*
 	 * The handlers receive &m and so may have replaced it or set it to
 	 * NULL; free whatever is left.
 	 */
 	if (m != NULL)
 		m_freem(m);
 }
 
 /*
  * Receive callback installed as the interface's if_input while debugnet
  * owns it.  Walks the m_nextpkt chain and feeds each packet, unlinked from
  * the chain, to debugnet_input_one().  The chain must hold at least one
  * packet.
  */
 static void
 debugnet_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf *next;
 
 	for (;;) {
 		/* Unlink the head packet before handing it off. */
 		next = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		debugnet_input_one(ifp, m);
 		if (next == NULL)
 			break;
 		m = next;
 	}
 }
 
 /*
  * Network polling primitive.
  *
  * Rather than relying on the regular network stack being in a sane state,
  * ask the driver for packets directly through its debugnet poll method.
  */
 void
 debugnet_network_poll(struct debugnet_pcb *pcb)
 {
 	struct ifnet *const nic = pcb->dp_ifp;
 
 	nic->if_debugnet_methods->dn_poll(nic, 1000);
 }
 
 /*
  * Start of consumer API surface.
  */
 /*
  * Tear down the (single, global) debugnet connection.  Also serves as the
  * error path of debugnet_connect(), so it copes with a PCB that was only
  * partially set up.
  */
 void
 debugnet_free(struct debugnet_pcb *pcb)
 {
 	struct ifnet *ifp;
 
 	MPASS(pcb == &g_dnet_pcb);
 	/* An overridden if_input implies the PCB is marked in use. */
 	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);
 
 	ifp = pcb->dp_ifp;
 	if (ifp != NULL) {
 		/* Give the driver its original receive callback back. */
 		if (pcb->dp_drv_input != NULL)
 			ifp->if_input = pcb->dp_drv_input;
 		/* Pair the DEBUGNET_START event sent by debugnet_connect(). */
 		if (pcb->dp_event_started)
 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
 	}
 	debugnet_mbuf_finish();
 
 	g_debugnet_pcb_inuse = false;
 	/* Overwrite the PCB with 0xfd, presumably to expose stale use. */
 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
 }
 
 int
 debugnet_connect(const struct debugnet_conn_params *dcp,
     struct debugnet_pcb **pcb_out)
 {
 	struct debugnet_proto_aux herald_auxdata;
 	struct debugnet_pcb *pcb;
 	struct ifnet *ifp;
 	int error;
 
 	if (g_debugnet_pcb_inuse) {
 		printf("%s: Only one connection at a time.\n", __func__);
 		return (EBUSY);
 	}
 
 	pcb = &g_dnet_pcb;
 	*pcb = (struct debugnet_pcb) {
 		.dp_state = DN_STATE_INIT,
 		.dp_client = dcp->dc_client,
 		.dp_server = dcp->dc_server,
 		.dp_gateway = dcp->dc_gateway,
 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
 		.dp_client_port = dcp->dc_client_port,
 		.dp_seqno = 1,
 		.dp_ifp = dcp->dc_ifp,
 		.dp_rx_handler = dcp->dc_rx_handler,
 		.dp_drv_input = NULL,
 	};
 
 	/* Switch to the debugnet mbuf zones. */
 	debugnet_mbuf_start();
 
 	/* At least one needed parameter is missing; infer it. */
 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
 	    pcb->dp_ifp == NULL) {
 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
 		struct ifnet *rt_ifp;
 		struct nhop_object *nh;
 
 		memset(&dest_sin, 0, sizeof(dest_sin));
 		dest_sin = (struct sockaddr_in) {
 			.sin_len = sizeof(dest_sin),
 			.sin_family = AF_INET,
 			.sin_addr.s_addr = pcb->dp_server,
 		};
 
 		CURVNET_SET(vnet0);
 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
 		    NHR_NONE);
 		CURVNET_RESTORE();
 
 		if (nh == NULL) {
 			printf("%s: Could not get route for that server.\n",
 			    __func__);
 			error = ENOENT;
 			goto cleanup;
 		}
 
 		/* TODO support AF_INET6 */
 		if (nh->gw_sa.sa_family == AF_INET)
 			gw_sin = &nh->gw4_sa;
 		else {
 			if (nh->gw_sa.sa_family == AF_LINK)
 				DNETDEBUG("Destination address is on link.\n");
 			gw_sin = NULL;
 		}
 
 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
 
 		rt_ifp = nh->nh_ifp;
 
 		if (pcb->dp_client == INADDR_ANY)
 			pcb->dp_client = local_sin->sin_addr.s_addr;
 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
 		if (pcb->dp_ifp == NULL)
 			pcb->dp_ifp = rt_ifp;
 	}
 
 	ifp = pcb->dp_ifp;
 
 	if (debugnet_debug > 0) {
 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
 		    gwbuf[INET_ADDRSTRLEN];
 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
 		if (pcb->dp_gateway != INADDR_ANY)
 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
 		    serbuf, pcb->dp_server_port,
 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
 		    clibuf, pcb->dp_client_port, if_name(ifp));
 	}
 
 	/* Validate iface is online and supported. */
 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
 		printf("%s: interface '%s' does not support debugnet\n",
 		    __func__, if_name(ifp));
 		error = ENODEV;
 		goto cleanup;
 	}
 	if ((if_getflags(ifp) & IFF_UP) == 0) {
 		printf("%s: interface '%s' link is down\n", __func__,
 		    if_name(ifp));
 		error = ENXIO;
 		goto cleanup;
 	}
 
 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
 	pcb->dp_event_started = true;
 
 	/*
 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
 	 * while the debugnet ifp's if_input is overridden with
 	 * debugnet_input().
 	 */
 	g_debugnet_pcb_inuse = true;
 
 	/* Make the card use *our* receive callback. */
 	pcb->dp_drv_input = ifp->if_input;
 	ifp->if_input = debugnet_input;
 
 	printf("%s: searching for %s MAC...\n", __func__,
 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
 
 	error = debugnet_arp_gw(pcb);
 	if (error != 0) {
 		printf("%s: failed to locate MAC address\n", __func__);
 		goto cleanup;
 	}
 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
 
 	herald_auxdata = (struct debugnet_proto_aux) {
 		.dp_offset_start = dcp->dc_herald_offset,
 		.dp_aux2 = dcp->dc_herald_aux2,
 	};
 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
 	    dcp->dc_herald_datalen, &herald_auxdata);
 	if (error != 0) {
 		printf("%s: failed to herald debugnet server\n", __func__);
 		goto cleanup;
 	}
 
 	*pcb_out = pcb;
 	return (0);
 
 cleanup:
 	debugnet_free(pcb);
 	return (error);
 }
 
 /*
  * Pre-allocated dump-time mbuf tracking.
  *
  * We just track the high water mark we've ever seen and allocate appropriately
  * for that iface/mtu combo.
  */
 static struct {
 	int nmbuf;	/* largest nmbuf value requested so far */
 	int ncl;	/* largest ncl value requested so far */
 	int clsize;	/* largest clsize value requested so far */
 } dn_hwm;
 /* Guards dn_hwm; see dn_maybe_reinit_mbufs(). */
 static struct mtx dn_hwm_lk;
 MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
 
 /*
  * Raise the recorded high water marks so that they cover the given values
  * and, if any mark grew, call debugnet_mbuf_reinit() with the current
  * marks.
  */
 static void
 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
 {
 	bool grew = false;
 
 	mtx_lock(&dn_hwm_lk);
 	if (dn_hwm.nmbuf < nmbuf) {
 		dn_hwm.nmbuf = nmbuf;
 		grew = true;
 	}
 	if (dn_hwm.ncl < ncl) {
 		dn_hwm.ncl = ncl;
 		grew = true;
 	}
 	if (dn_hwm.clsize < clsize) {
 		dn_hwm.clsize = clsize;
 		grew = true;
 	}
 
 	/* Read the marks back while the lock is still held. */
 	nmbuf = dn_hwm.nmbuf;
 	ncl = dn_hwm.ncl;
 	clsize = dn_hwm.clsize;
 	mtx_unlock(&dn_hwm_lk);
 
 	if (grew)
 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
 }
 
 /*
  * Query a debugnet-capable interface for its receive ring count, cluster
  * count and cluster size, derive the mbuf requirements from them and fold
  * those into the global high water marks.  Interfaces that do not support
  * debugnet are ignored.
  */
 void
 debugnet_any_ifnet_update(struct ifnet *ifp)
 {
 	int clsize, nmbuf, ncl, nrxr;
 
 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
 		return;
 
 	/* The driver reports its needs through the three out-parameters. */
 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
 
 	/*
 	 * We need two headers per message on the transmit side. Multiply by
 	 * four to give us some breathing room.
 	 */
 	nmbuf = ncl * (4 + nrxr);
 	ncl *= nrxr;
 
 	/*
 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
 	 * dn_init method is available.
 	 */
 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
 		/*
 		 * With INVARIANTS the message is printed unconditionally;
 		 * without it, only when bootverbose is set.
 		 */
 #ifndef INVARIANTS
 		if (bootverbose)
 #endif
 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
 		    __func__, if_name(ifp), ifp);
 		return;
 	}
 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
 }
 
 /*
  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
  *
  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
  * because the driver is still in attach.  Since we cannot use down interfaces,
  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
  * with vtnet and dhcpclient that event just never occurs.
  *
  * So that's how I've landed on the lower level ifnet_link_event.
  */
 
 static void
 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
 {
 	/* Only link-up events are of interest. */
 	if (link_state != LINK_STATE_UP)
 		return;
 
 	debugnet_any_ifnet_update(ifp);
 }
 
 /* Tag returned when dn_ifnet_event() is registered below. */
 static eventhandler_tag dn_attach_cookie;
 /*
  * Hook dn_ifnet_event() onto ifnet_link_event.  Run once through the
  * SYSINIT below, at SI_SUB_EVENTHANDLER + 1.
  */
 static void
 dn_evh_init(void *ctx __unused)
 {
 	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
 	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
 }
 SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
 
 /*
  * DDB parsing helpers for debugnet(4) consumers.
  */
 #ifdef DDB
 /* Parse state for one IPv4-address option of a debugnet DDB command. */
 struct my_inet_opt {
 	bool has_opt;		/* set once the option was parsed successfully */
 	const char *printname;	/* option name used in diagnostics */
 	in_addr_t *result;	/* where the parsed address is stored */
 };
 
 /*
  * Lex a dotted-quad IPv4 address from the DDB input.  On success the
  * address is stored, in network byte order, through opt->result and
  * opt->has_opt is set.  Returns 0 on success, EINVAL on a syntax error and
  * EDOM if an octet exceeds UINT8_MAX.
  */
 static int
 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
 {
 	in_addr_t addr = 0;
 	unsigned idx;
 	int tok;
 
 	for (idx = 0; idx < 4; idx++) {
 		tok = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
 		if (tok != tNUMBER) {
 			db_printf("%s:%s: octet %u expected number; found %d\n",
 			    __func__, opt->printname, idx, tok);
 			return (EINVAL);
 		}
 
 		/*
 		 * db_lex lexes '-' distinctly from the number itself, but
 		 * let's document that invariant.
 		 */
 		MPASS(db_tok_number >= 0);
 
 		if (db_tok_number > UINT8_MAX) {
 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
 			    opt->printname, idx, (intmax_t)db_tok_number);
 			return (EDOM);
 		}
 
 		/* Constructed host-endian and converted to network later. */
 		addr = (addr << 8) | db_tok_number;
 
 		/* Every octet but the last is followed by a dot. */
 		if (idx == 3)
 			continue;
 
 		tok = db_read_token_flags(DRT_WSPACE);
 		if (tok != tDOT) {
 			db_printf("%s:%s: octet %u expected '.'; found"
 			    " %d\n", __func__, opt->printname, idx,
 			    tok);
 			return (EINVAL);
 		}
 	}
 
 	*opt->result = htonl(addr);
 	opt->has_opt = true;
 	return (0);
 }
 
 /*
  * Parse the option list of a debugnet-based DDB command into *result.
  *
  * Recognized options are -s <server> (required), -g <gateway>,
  * -c <localip> and -i <interface>; -h, or any other flag, prints the usage
  * message.  *result is zeroed first and only describes a configuration when
  * 0 is returned.
  *
  * Parameters:
  *	cmd	Command name, used in diagnostics and the usage message
  *	result	Parsed configuration
  *
  * Returns:
  *	int	0 on success; EINVAL on a syntax error or missing server,
  *		ENOENT if the interface does not exist, or the error from
  *		dn_parse_optarg_ipv4().  On failure the rest of the input
  *		line has been skipped.
  */
 int
 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
 {
 	struct ifnet *ifp;
 	int t, error;
 	bool want_ifp;
 	char ch;
 
 	/* Each address option parses straight into its field of *result. */
 	struct my_inet_opt opt_client = {
 		.printname = "client",
 		.result = &result->dd_client,
 	},
 	opt_server = {
 		.printname = "server",
 		.result = &result->dd_server,
 	},
 	opt_gateway = {
 		.printname = "gateway",
 		.result = &result->dd_gateway,
 	},
 	*cur_inet_opt;
 
 	ifp = NULL;
 	memset(result, 0, sizeof(*result));
 
 	/*
 	 * command [space] [-] [opt] [[space] [optarg]] ...
 	 *
 	 * db_command has already lexed 'command' for us.
 	 */
 	t = db_read_token_flags(DRT_WSPACE);
 	if (t == tWSPACE)
 		t = db_read_token_flags(DRT_WSPACE);
 
 	/* One iteration per "-<flag> <argument>" pair. */
 	while (t != tEOL) {
 		if (t != tMINUS) {
 			db_printf("%s: Bad syntax; expected '-', got %d\n",
 			    cmd, t);
 			goto usage;
 		}
 
 		t = db_read_token_flags(DRT_WSPACE);
 		if (t != tIDENT) {
 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
 			    cmd, t);
 			goto usage;
 		}
 
 		if (strlen(db_tok_string) > 1) {
 			db_printf("%s: Bad syntax; expected single option "
 			    "flag, got '%s'\n", cmd, db_tok_string);
 			goto usage;
 		}
 
 		/* Exactly one of want_ifp / cur_inet_opt is set by the switch. */
 		want_ifp = false;
 		cur_inet_opt = NULL;
 		switch ((ch = db_tok_string[0])) {
 		default:
 			DNETDEBUG("Unexpected: '%c'\n", ch);
 			/* FALLTHROUGH */
 		case 'h':
 			goto usage;
 		case 'c':
 			cur_inet_opt = &opt_client;
 			break;
 		case 'g':
 			cur_inet_opt = &opt_gateway;
 			break;
 		case 's':
 			cur_inet_opt = &opt_server;
 			break;
 		case 'i':
 			want_ifp = true;
 			break;
 		}
 
 		t = db_read_token_flags(DRT_WSPACE);
 		if (t != tWSPACE) {
 			db_printf("%s: Bad syntax; expected space after "
 			    "flag %c, got %d\n", cmd, ch, t);
 			goto usage;
 		}
 
 		if (want_ifp) {
 			t = db_read_token_flags(DRT_WSPACE);
 			if (t != tIDENT) {
 				db_printf("%s: Expected interface but got %d\n",
 				    cmd, t);
 				goto usage;
 			}
 
 			CURVNET_SET(vnet0);
 			/*
 			 * We *don't* take a ref here because the only current
 			 * consumer, db_netdump_cmd, does not need it.  It
 			 * (somewhat redundantly) extracts the if_name(),
 			 * re-lookups the ifp, and takes its own reference.
 			 */
 			ifp = ifunit(db_tok_string);
 			CURVNET_RESTORE();
 			if (ifp == NULL) {
 				db_printf("Could not locate interface %s\n",
 				    db_tok_string);
 				error = ENOENT;
 				goto cleanup;
 			}
 		} else {
 			MPASS(cur_inet_opt != NULL);
 			/* Assume IPv4 for now. */
 			error = dn_parse_optarg_ipv4(cur_inet_opt);
 			if (error != 0)
 				goto cleanup;
 		}
 
 		/* Skip (mandatory) whitespace after option, if not EOL. */
 		t = db_read_token_flags(DRT_WSPACE);
 		if (t == tEOL)
 			break;
 		if (t != tWSPACE) {
 			db_printf("%s: Bad syntax; expected space after "
 			    "flag %c option; got %d\n", cmd, ch, t);
 			goto usage;
 		}
 		t = db_read_token_flags(DRT_WSPACE);
 	}
 
 	/* The server address is the only mandatory option. */
 	if (!opt_server.has_opt) {
 		db_printf("%s: need a destination server address\n", cmd);
 		goto usage;
 	}
 
 	result->dd_has_client = opt_client.has_opt;
 	result->dd_has_gateway = opt_gateway.has_opt;
 	result->dd_ifp = ifp;
 
 	/* We parsed the full line to tEOL already, or bailed with an error. */
 	return (0);
 
 usage:
 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
 	    "-i <interface>]\n", cmd);
 	error = EINVAL;
 	/* FALLTHROUGH */
 cleanup:
 	/* Discard the rest of the line so it is not parsed as a command. */
 	db_skip_to_eol();
 	return (error);
 }
 #endif /* DDB */
diff --git a/sys/net/debugnet_inet.c b/sys/net/debugnet_inet.c
index e7449113ba10..bd5195ab86d5 100644
--- a/sys/net/debugnet_inet.c
+++ b/sys/net/debugnet_inet.c
@@ -1,502 +1,503 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2019 Isilon Systems, LLC.
  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
  * Copyright (c) 2000 Darrell Anderson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <machine/in_cksum.h>
 #include <machine/pcb.h>
 
 #include <net/debugnet.h>
 #define	DEBUGNET_INTERNAL
 #include <net/debugnet_int.h>
 
 int debugnet_arp_nretries = 3;
 SYSCTL_INT(_net_debugnet, OID_AUTO, arp_nretries, CTLFLAG_RWTUN,
     &debugnet_arp_nretries, 0,
     "Number of ARP attempts before giving up");
 
 /*
  * Handler for IP packets: checks their sanity and then processes any debugnet
  * ACK packets it finds.
  *
  * It needs to partially replicate the behaviour of ip_input() and udp_input().
  *
  * Parameters:
  *	pcb	a pointer to the live debugnet PCB
  *	mb	a pointer to an mbuf * containing the packet received
  *		Updates *mb if m_pullup et al change the pointer
  *		Assumes the calling function will take care of freeing the mbuf
  */
 void
 debugnet_handle_ip(struct debugnet_pcb *pcb, struct mbuf **mb)
 {
 	struct ip *ip;
 	struct mbuf *m;
 	unsigned short hlen;
 
 	/* IP traffic is ignored until the PCB has a gateway MAC address. */
 	if (pcb->dp_state < DN_STATE_HAVE_GW_MAC)
 		return;
 
 	/* IP processing. */
 	m = *mb;
 	if (m->m_pkthdr.len < sizeof(struct ip)) {
 		DNETDEBUG("dropping packet too small for IP header\n");
 		return;
 	}
 	if (m->m_len < sizeof(struct ip)) {
 		/*
 		 * Make the fixed header contiguous.  The caller's pointer is
 		 * updated before the NULL check so it never keeps referring to
 		 * the mbuf that was handed to m_pullup().
 		 */
 		m = m_pullup(m, sizeof(struct ip));
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("m_pullup failed\n");
 			return;
 		}
 	}
 	ip = mtod(m, struct ip *);
 
 	/* IP version. */
 	if (ip->ip_v != IPVERSION) {
 		DNETDEBUG("bad IP version %d\n", ip->ip_v);
 		return;
 	}
 
 	/* Header length. */
 	/* ip_hl counts 32-bit words; convert it to bytes. */
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {
 		DNETDEBUG("bad IP header length (%hu)\n", hlen);
 		return;
 	}
 	if (hlen > m->m_len) {
 		m = m_pullup(m, hlen);
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("m_pullup failed\n");
 			return;
 		}
 		/* The data pointer may have moved; re-derive the header. */
 		ip = mtod(m, struct ip *);
 	}
 	/* Ignore packets with IP options. */
 	if (hlen > sizeof(struct ip)) {
 		DNETDEBUG("drop packet with IP options\n");
 		return;
 	}
 
 #ifdef INVARIANTS
 	/*
 	 * Only compiled with INVARIANTS: reject loopback source/destination
 	 * addresses arriving on an interface without IFF_LOOPBACK.
 	 */
 	if ((IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) &&
 	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
 		DNETDEBUG("Bad IP header (RFC1122)\n");
 		return;
 	}
 #endif
 
 	/* Checksum. */
 	/*
 	 * Only a checksum result already recorded in csum_flags is honoured;
 	 * when CSUM_IP_CHECKED is clear the header checksum is not verified
 	 * at all (the else branch is intentionally empty).
 	 */
 	if ((m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) != 0) {
 		if ((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0) {
 			DNETDEBUG("bad IP checksum\n");
 			return;
 		}
 	} else {
 		/* XXX */ ;
 	}
 
 	/* Convert fields to host byte order. */
 	/* Note: ip_len and ip_off are rewritten in place in the packet. */
 	ip->ip_len = ntohs(ip->ip_len);
 	if (ip->ip_len < hlen) {
 		DNETDEBUG("IP packet smaller (%hu) than header (%hu)\n",
 		    ip->ip_len, hlen);
 		return;
 	}
 	if (m->m_pkthdr.len < ip->ip_len) {
 		DNETDEBUG("IP packet bigger (%hu) than ethernet packet (%d)\n",
 		    ip->ip_len, m->m_pkthdr.len);
 		return;
 	}
 	if (m->m_pkthdr.len > ip->ip_len) {
 		/* Truncate the packet to the IP length. */
 		if (m->m_len == m->m_pkthdr.len) {
 			/* Single-mbuf packet: just shorten both lengths. */
 			m->m_len = ip->ip_len;
 			m->m_pkthdr.len = ip->ip_len;
 		} else
 			/* The count is negative here, i.e. a trim from the tail. */
 			m_adj(m, ip->ip_len - m->m_pkthdr.len);
 	}
 
 	ip->ip_off = ntohs(ip->ip_off);
 
 	/* Check that the source is the server's IP. */
 	/* Addresses are compared as stored in the packet, without swapping. */
 	if (ip->ip_src.s_addr != pcb->dp_server) {
 		DNETDEBUG("drop packet not from server (from 0x%x)\n",
 		    ip->ip_src.s_addr);
 		return;
 	}
 
 	/* Check if the destination IP is ours. */
 	if (ip->ip_dst.s_addr != pcb->dp_client) {
 		DNETDEBUGV("drop packet not to our IP\n");
 		return;
 	}
 
 	if (ip->ip_p != IPPROTO_UDP) {
 		DNETDEBUG("drop non-UDP packet\n");
 		return;
 	}
 
 	/* Do not deal with fragments. */
 	if ((ip->ip_off & (IP_MF | IP_OFFMASK)) != 0) {
 		DNETDEBUG("drop fragmented packet\n");
 		return;
 	}
 
 	/*
 	 * As with the IP checksum, the UDP checksum is only rejected when
 	 * csum_flags carries CSUM_PSEUDO_HDR without CSUM_DATA_VALID; no
 	 * software verification is performed otherwise.
 	 */
 	if ((m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) != 0) {
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) == 0) {
 			DNETDEBUG("bad UDP checksum\n");
 			return;
 		}
 	} else {
 		/* XXX */ ;
 	}
 
 	/* UDP custom is to have packet length not include IP header. */
 	ip->ip_len -= hlen;
 
 	/* Checked above before decoding IP header. */
 	/*
 	 * NOTE(review): the earlier check was against sizeof(struct ip); this
 	 * relies on struct ipovly having the same size -- confirm.
 	 */
 	MPASS(m->m_pkthdr.len >= sizeof(struct ipovly));
 
 	/* Put the UDP header at start of chain. */
 	m_adj(m, sizeof(struct ipovly));
 	debugnet_handle_udp(pcb, mb);
 }
 
 /*
  * Builds and sends a single ARP request to locate the L2 address for a given
  * INET address.
  *
  * Return value:
  *	0 on success
  *	errno on error
  */
 static int
 debugnet_send_arp(struct debugnet_pcb *pcb, in_addr_t dst)
 {
 	struct ifnet *ifp = pcb->dp_ifp;
 	struct ether_addr bcast;
 	struct arphdr *req;
 	struct mbuf *m;
 	int len;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		printf("%s: Out of mbufs\n", __func__);
 		return (ENOBUFS);
 	}
 
 	/* Size the packet for an Ethernet/IPv4 ARP message and align it. */
 	len = arphdr_len2(ETHER_ADDR_LEN, sizeof(struct in_addr));
 	m->m_len = len;
 	m->m_pkthdr.len = len;
 	MH_ALIGN(m, len);
 
 	/* Fixed part of the ARP header. */
 	req = mtod(m, struct arphdr *);
 	req->ar_hrd = htons(ARPHRD_ETHER);
 	req->ar_pro = htons(ETHERTYPE_IP);
 	req->ar_hln = ETHER_ADDR_LEN;
 	req->ar_pln = sizeof(struct in_addr);
 	req->ar_op = htons(ARPOP_REQUEST);
 
 	/* Sender: our link-layer address and the client IP from the PCB. */
 	memcpy(ar_sha(req), IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	((struct in_addr *)ar_spa(req))->s_addr = pcb->dp_client;
 
 	/* Target: hardware address left zeroed, protocol address is dst. */
 	bzero(ar_tha(req), ETHER_ADDR_LEN);
 	((struct in_addr *)ar_tpa(req))->s_addr = dst;
 
 	/* The request is sent to the all-ones (broadcast) address. */
 	memset(&bcast, 0xFF, ETHER_ADDR_LEN);
 	return (debugnet_ether_output(m, ifp, bcast, ETHERTYPE_ARP));
 }
 
 /*
  * Handler for ARP packets: checks their sanity and then
  * 1. If the ARP is a request for our IP, respond with our MAC address
  * 2. If the ARP is a response from our server, record its MAC address
  *
  * It needs to replicate partially the behaviour of arpintr() and
  * in_arpinput().
  *
  * Parameters:
  *	pcb	a pointer to the live debugnet PCB
  *	mb	a pointer to an mbuf * containing the packet received
  *		Updates *mb if m_pullup et al change the pointer
  *		Assumes the calling function will take care of freeing the mbuf
  */
 void
 debugnet_handle_arp(struct debugnet_pcb *pcb, struct mbuf **mb)
 {
 	char buf[INET_ADDRSTRLEN];
 	struct in_addr isaddr, itaddr;
 	struct ether_addr dst;
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct ifnet *ifp;
 	uint8_t *enaddr;
 	int req_len, op;
 
 	m = *mb;
 	ifp = m->m_pkthdr.rcvif;
 	if (m->m_len < sizeof(struct arphdr)) {
 		/* Keep the caller's pointer in sync with m_pullup()'s result. */
 		m = m_pullup(m, sizeof(struct arphdr));
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("runt packet: m_pullup failed\n");
 			return;
 		}
 	}
 
 	ah = mtod(m, struct arphdr *);
 	if (ntohs(ah->ar_hrd) != ARPHRD_ETHER) {
 		DNETDEBUG("unknown hardware address 0x%2D)\n",
 		    (unsigned char *)&ah->ar_hrd, "");
 		return;
 	}
 	if (ntohs(ah->ar_pro) != ETHERTYPE_IP) {
 		DNETDEBUG("drop ARP for unknown protocol %d\n",
 		    ntohs(ah->ar_pro));
 		return;
 	}
 
 	/*
 	 * The address accessors (ar_spa(), ar_tha(), ar_tpa()) and the reply
 	 * construction below are driven by the lengths carried in the packet,
 	 * while only arphdr_len2(if_addrlen, sizeof(struct in_addr)) bytes
 	 * are pulled up.  Refuse anything whose lengths disagree, so that a
 	 * crafted packet cannot make us access memory beyond the pulled-up
 	 * data, beyond the interface's link-layer address, or beyond itaddr.
 	 */
 	if (ah->ar_hln != ifp->if_addrlen ||
 	    ah->ar_pln != sizeof(struct in_addr)) {
 		DNETDEBUG("drop ARP with unexpected address lengths "
 		    "(hln %d, pln %d)\n", ah->ar_hln, ah->ar_pln);
 		return;
 	}
 
 	req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
 	if (m->m_len < req_len) {
 		m = m_pullup(m, req_len);
 		*mb = m;
 		if (m == NULL) {
 			DNETDEBUG("runt packet: m_pullup failed\n");
 			return;
 		}
 	}
 	/* The data pointer may have moved; re-derive the header. */
 	ah = mtod(m, struct arphdr *);
 
 	op = ntohs(ah->ar_op);
 	memcpy(&isaddr, ar_spa(ah), sizeof(isaddr));
 	memcpy(&itaddr, ar_tpa(ah), sizeof(itaddr));
 	enaddr = (uint8_t *)IF_LLADDR(ifp);
 
 	if (memcmp(ar_sha(ah), enaddr, ifp->if_addrlen) == 0) {
 		DNETDEBUG("ignoring ARP from myself\n");
 		return;
 	}
 
 	if (isaddr.s_addr == pcb->dp_client) {
 		printf("%s: %*D is using my IP address %s!\n", __func__,
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		    inet_ntoa_r(isaddr, buf));
 		return;
 	}
 
 	if (memcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen) == 0) {
 		DNETDEBUG("ignoring ARP from broadcast address\n");
 		return;
 	}
 
 	if (op == ARPOP_REPLY) {
 		/* Only replies from the configured server or gateway matter. */
 		if (isaddr.s_addr != pcb->dp_gateway &&
 		    isaddr.s_addr != pcb->dp_server) {
 			inet_ntoa_r(isaddr, buf);
 			DNETDEBUG("ignoring ARP reply from %s (not configured"
 			    " server or gateway)\n", buf);
 			return;
 		}
 		if (pcb->dp_state >= DN_STATE_HAVE_GW_MAC) {
 			inet_ntoa_r(isaddr, buf);
 			DNETDEBUG("ignoring server ARP reply from %s (already"
 			    " have gateway address)\n", buf);
 			return;
 		}
 		MPASS(pcb->dp_state == DN_STATE_INIT);
 		memcpy(pcb->dp_gw_mac.octet, ar_sha(ah),
 		    min(ah->ar_hln, ETHER_ADDR_LEN));
 
 		DNETDEBUG("got server MAC address %6D\n",
 		    pcb->dp_gw_mac.octet, ":");
 
 		pcb->dp_state = DN_STATE_HAVE_GW_MAC;
 		return;
 	}
 
 	if (op != ARPOP_REQUEST) {
 		DNETDEBUG("ignoring ARP non-request/reply\n");
 		return;
 	}
 
 	if (itaddr.s_addr != pcb->dp_client) {
 		DNETDEBUG("ignoring ARP not to our IP\n");
 		return;
 	}
 
 	/*
 	 * Turn the request into a reply in place: the requester becomes the
 	 * target and we become the sender.
 	 */
 	memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 	memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	m->m_len = arphdr_len(ah);
 	m->m_pkthdr.len = m->m_len;
 
 	memcpy(dst.octet, ar_tha(ah), ETHER_ADDR_LEN);
 	debugnet_ether_output(m, ifp, dst, ETHERTYPE_ARP);
 	/* The mbuf was handed to the output path; the caller must not free it. */
 	*mb = NULL;
 }
 
 /*
  * Sends ARP requests to locate the server and waits for a response.
  * We first try to ARP the server itself, and fall back to the provided
  * gateway if the server appears to be off-link.
  *
  * Return value:
  *	0 on success
  *	errno on error
  */
 int
 debugnet_arp_gw(struct debugnet_pcb *pcb)
 {
 	in_addr_t dst;
 	int error, polls, retries;
 
 	/* First round targets the server itself. */
 	dst = pcb->dp_server;
 restart:
 	for (retries = 0; retries < debugnet_arp_nretries; retries++) {
 		error = debugnet_send_arp(pcb, dst);
 		if (error != 0)
 			return (error);
 		/* Poll the interface until a reply updates the PCB state. */
 		for (polls = 0; polls < debugnet_npolls &&
 		    pcb->dp_state < DN_STATE_HAVE_GW_MAC; polls++) {
 			debugnet_network_poll(pcb);
 			DELAY(500);
 		}
 		if (pcb->dp_state >= DN_STATE_HAVE_GW_MAC)
 			break;
 		printf("(ARP retry)");
 	}
 	if (pcb->dp_state >= DN_STATE_HAVE_GW_MAC)
 		return (0);
 	if (dst == pcb->dp_server) {
 		printf("\nFailed to ARP server");
 		/*
 		 * Fall back to the gateway only when it is a different address.
 		 * If the gateway equals the server, the second round would be
 		 * indistinguishable from the first (dst == dp_server again) and
 		 * this function would restart forever.
 		 */
 		if (pcb->dp_gateway != INADDR_ANY &&
 		    pcb->dp_gateway != pcb->dp_server) {
 			printf(", trying to reach gateway...\n");
 			dst = pcb->dp_gateway;
 			goto restart;
 		} else
 			printf(".\n");
 	} else
 		printf("\nFailed to ARP gateway.\n");
 
 	return (ETIMEDOUT);
 }
 
 /*
  * Unreliable IPv4 transmission of an mbuf chain to the debugnet server
  * Note: can't handle fragmentation; fails if the packet is larger than
  *	 ifp->if_mtu after adding the UDP/IP headers
  *
  * Parameters:
  *	pcb	The debugnet context block
  *	m	mbuf chain
  *
  * Returns:
  *	int	see errno.h, 0 for success
  */
 int
 debugnet_ip_output(struct debugnet_pcb *pcb, struct mbuf *m)
 {
 	struct udphdr *udp;
 	struct ifnet *ifp;
 	struct ip *ip;
 
 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
 
 	ifp = pcb->dp_ifp;
 
 	/* Make room for the IP header in front of the existing data. */
 	M_PREPEND(m, sizeof(*ip), M_NOWAIT);
 	if (m == NULL) {
 		printf("%s: out of mbufs\n", __func__);
 		return (ENOBUFS);
 	}
 
 	/* No fragmentation support: oversized packets are freed and refused. */
 	if (m->m_pkthdr.len > ifp->if_mtu) {
 		printf("%s: Packet is too big: %d > MTU %u\n", __func__,
 		    m->m_pkthdr.len, ifp->if_mtu);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	ip = mtod(m, void *);
 	/* The UDP header is expected to follow the IP header directly. */
 	udp = (void *)(ip + 1);
 
 	/*
 	 * Temporarily lay the header out as a checksum pseudo-header: every
 	 * field before ip_p is zeroed, ip_p is the protocol, the ip_sum slot
 	 * carries the UDP length, followed by source and destination.  The
 	 * real header fields are filled in after the UDP checksum is taken.
 	 */
 	memset(ip, 0, offsetof(struct ip, ip_p));
 	ip->ip_p = IPPROTO_UDP;
 	ip->ip_sum = udp->uh_ulen;
 	ip->ip_src = (struct in_addr) { pcb->dp_client };
 	ip->ip_dst = (struct in_addr) { pcb->dp_server };
 
 	/* Compute UDP-IPv4 checksum. */
 	udp->uh_sum = in_cksum(m, m->m_pkthdr.len);
 	/* A computed checksum of zero is sent as 0xffff instead. */
 	if (udp->uh_sum == 0)
 		udp->uh_sum = 0xffff;
 
 	/* Now overwrite the pseudo-header with the real IP header fields. */
 	ip->ip_v = IPVERSION;
 	ip->ip_hl = sizeof(*ip) >> 2;
 	ip->ip_tos = 0;
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_id = 0;
 	ip->ip_off = htons(IP_DF);
 	ip->ip_ttl = 255;
 	/* The checksum field must be zero while the header sum is computed. */
 	ip->ip_sum = 0;
 	ip->ip_sum = in_cksum(m, sizeof(struct ip));
 
 	/* Frames always go to the MAC learned by the ARP exchange. */
 	return (debugnet_ether_output(m, ifp, pcb->dp_gw_mac, ETHERTYPE_IP));
 }
diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c
index 65b3a337eedc..78345aae68e5 100644
--- a/sys/net/ieee8023ad_lacp.c
+++ b/sys/net/ieee8023ad_lacp.c
@@ -1,2220 +1,2221 @@
 /*	$NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c)2005 YAMAMOTO Takashi,
  * Copyright (c)2008 Andrew Thompson <thompsa@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h> /* hz */
 #include <sys/socket.h> /* for net/if.h */
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <machine/stdarg.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/ethernet.h>
 #include <net/infiniband.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 
 #include <net/if_lagg.h>
 #include <net/ieee8023ad_lacp.h>
 
 /*
  * actor system priority and port priority.
  * XXX should be configurable.
  */
 
 #define	LACP_SYSTEM_PRIO	0x8000
 #define	LACP_PORT_PRIO		0x8000
 
 /*
  * Multicast destination address for slow-protocol frames: transmitted LACPDUs
  * and marker PDUs are addressed to it, and received LACPDUs are checked
  * against it.
  */
 const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] =
     { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 };
 
 /*
  * TLV layout expected in a LACPDU, handed to tlv_check() when a PDU is
  * received.  Each entry pairs a TLV type with its total length (TLV header
  * plus payload); the list ends with a { 0, 0 } entry.
  */
 static const struct tlv_template lacp_info_tlv_template[] = {
 	{ LACP_TYPE_ACTORINFO,
 	    sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) },
 	{ LACP_TYPE_PARTNERINFO,
 	    sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) },
 	{ LACP_TYPE_COLLECTORINFO,
 	    sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) },
 	{ 0, 0 },
 };
 
 /* TLV layout of a marker information PDU (same entry format as above). */
 static const struct tlv_template marker_info_tlv_template[] = {
 	{ MARKER_TYPE_INFO,
 	    sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) },
 	{ 0, 0 },
 };
 
 /* TLV layout of a marker response PDU (same entry format as above). */
 static const struct tlv_template marker_response_tlv_template[] = {
 	{ MARKER_TYPE_RESPONSE,
 	    sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) },
 	{ 0, 0 },
 };
 
 typedef void (*lacp_timer_func_t)(struct lacp_port *);
 
 static void	lacp_fill_actorinfo(struct lacp_port *, struct lacp_peerinfo *);
 static void	lacp_fill_markerinfo(struct lacp_port *,
 		    struct lacp_markerinfo *);
 
 static uint64_t	lacp_aggregator_bandwidth(struct lacp_aggregator *);
 static void	lacp_suppress_distributing(struct lacp_softc *,
 		    struct lacp_aggregator *);
 static void	lacp_transit_expire(void *);
 static void	lacp_update_portmap(struct lacp_softc *);
 static void	lacp_select_active_aggregator(struct lacp_softc *);
 static uint16_t	lacp_compose_key(struct lacp_port *);
 static int	tlv_check(const void *, size_t, const struct tlvhdr *,
 		    const struct tlv_template *, boolean_t);
 static void	lacp_tick(void *);
 
 static void	lacp_fill_aggregator_id(struct lacp_aggregator *,
 		    const struct lacp_port *);
 static void	lacp_fill_aggregator_id_peer(struct lacp_peerinfo *,
 		    const struct lacp_peerinfo *);
 static int	lacp_aggregator_is_compatible(const struct lacp_aggregator *,
 		    const struct lacp_port *);
 static int	lacp_peerinfo_is_compatible(const struct lacp_peerinfo *,
 		    const struct lacp_peerinfo *);
 
 static struct lacp_aggregator *lacp_aggregator_get(struct lacp_softc *,
 		    struct lacp_port *);
 static void	lacp_aggregator_addref(struct lacp_softc *,
 		    struct lacp_aggregator *);
 static void	lacp_aggregator_delref(struct lacp_softc *,
 		    struct lacp_aggregator *);
 
 /* receive machine */
 
 static int	lacp_pdu_input(struct lacp_port *, struct mbuf *);
 static int	lacp_marker_input(struct lacp_port *, struct mbuf *);
 static void	lacp_sm_rx(struct lacp_port *, const struct lacpdu *);
 static void	lacp_sm_rx_timer(struct lacp_port *);
 static void	lacp_sm_rx_set_expired(struct lacp_port *);
 static void	lacp_sm_rx_update_ntt(struct lacp_port *,
 		    const struct lacpdu *);
 static void	lacp_sm_rx_record_pdu(struct lacp_port *,
 		    const struct lacpdu *);
 static void	lacp_sm_rx_update_selected(struct lacp_port *,
 		    const struct lacpdu *);
 static void	lacp_sm_rx_record_default(struct lacp_port *);
 static void	lacp_sm_rx_update_default_selected(struct lacp_port *);
 static void	lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *,
 		    const struct lacp_peerinfo *);
 
 /* mux machine */
 
 static void	lacp_sm_mux(struct lacp_port *);
 static void	lacp_set_mux(struct lacp_port *, enum lacp_mux_state);
 static void	lacp_sm_mux_timer(struct lacp_port *);
 
 /* periodic transmit machine */
 
 static void	lacp_sm_ptx_update_timeout(struct lacp_port *, uint8_t);
 static void	lacp_sm_ptx_tx_schedule(struct lacp_port *);
 static void	lacp_sm_ptx_timer(struct lacp_port *);
 
 /* transmit machine */
 
 static void	lacp_sm_tx(struct lacp_port *);
 static void	lacp_sm_assert_ntt(struct lacp_port *);
 
 static void	lacp_run_timers(struct lacp_port *);
 static int	lacp_compare_peerinfo(const struct lacp_peerinfo *,
 		    const struct lacp_peerinfo *);
 static int	lacp_compare_systemid(const struct lacp_systemid *,
 		    const struct lacp_systemid *);
 static void	lacp_port_enable(struct lacp_port *);
 static void	lacp_port_disable(struct lacp_port *);
 static void	lacp_select(struct lacp_port *);
 static void	lacp_unselect(struct lacp_port *);
 static void	lacp_disable_collecting(struct lacp_port *);
 static void	lacp_enable_collecting(struct lacp_port *);
 static void	lacp_disable_distributing(struct lacp_port *);
 static void	lacp_enable_distributing(struct lacp_port *);
 static int	lacp_xmit_lacpdu(struct lacp_port *);
 static int	lacp_xmit_marker(struct lacp_port *);
 
 /* Debugging */
 
 static void	lacp_dump_lacpdu(const struct lacpdu *);
 static const char *lacp_format_partner(const struct lacp_peerinfo *, char *,
 		    size_t);
 static const char *lacp_format_lagid(const struct lacp_peerinfo *,
 		    const struct lacp_peerinfo *, char *, size_t);
 static const char *lacp_format_lagid_aggregator(const struct lacp_aggregator *,
 		    char *, size_t);
 static const char *lacp_format_state(uint8_t, char *, size_t);
 static const char *lacp_format_mac(const uint8_t *, char *, size_t);
 static const char *lacp_format_systemid(const struct lacp_systemid *, char *,
 		    size_t);
 static const char *lacp_format_portid(const struct lacp_portid *, char *,
 		    size_t);
 static void	lacp_dprintf(const struct lacp_port *, const char *, ...)
 		    __attribute__((__format__(__printf__, 2, 3)));
 
 VNET_DEFINE_STATIC(int, lacp_debug);
 #define	V_lacp_debug	VNET(lacp_debug)
 SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "ieee802.3ad");
 SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RWTUN | CTLFLAG_VNET,
     &VNET_NAME(lacp_debug), 0, "Enable LACP debug logging (1=debug, 2=trace)");
 
 VNET_DEFINE_STATIC(int, lacp_default_strict_mode) = 1;
 SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, default_strict_mode,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(lacp_default_strict_mode), 0,
     "LACP strict protocol compliance default");
 /*
  * Debug output helpers, each gated on one bit of the per-vnet lacp_debug
  * sysctl: 0x01 for LACP_DPRINTF, 0x02 for LACP_TRACE, 0x04 for LACP_TPRINTF.
  * LACP_DPRINTF and LACP_TPRINTF take a fully parenthesized lacp_dprintf()
  * argument list; LACP_TRACE takes the port and logs the calling function.
  * NOTE(review): each macro expands to a bare "if (...) { ... }", so a use
  * directly before an "else" would capture that else.
  */
 #define LACP_DPRINTF(a) if (V_lacp_debug & 0x01) { lacp_dprintf a ; }
 #define LACP_TRACE(a) if (V_lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); }
 #define LACP_TPRINTF(a) if (V_lacp_debug & 0x04) { lacp_dprintf a ; }
 
 /*
  * partner administration variables.
  * XXX should be configurable.
  */
 
 /*
  * Default partner information with the SYNC, AGGREGATION, COLLECTING and
  * DISTRIBUTING state bits already set and the lowest-precedence priority
  * values (0xffff).
  */
 static const struct lacp_peerinfo lacp_partner_admin_optimistic = {
 	.lip_systemid = { .lsi_prio = 0xffff },
 	.lip_portid = { .lpi_prio = 0xffff },
 	.lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION |
 	    LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING,
 };
 
 /* Default partner information with no state bits set at all. */
 static const struct lacp_peerinfo lacp_partner_admin_strict = {
 	.lip_systemid = { .lsi_prio = 0xffff },
 	.lip_portid = { .lpi_prio = 0xffff },
 	.lip_state = 0,
 };
 
 /*
  * Expiry handler for each LACP_TIMER_* index.
  * (Presumably dispatched from lacp_run_timers(); not visible in this chunk.)
  */
 static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = {
 	[LACP_TIMER_CURRENT_WHILE] = lacp_sm_rx_timer,
 	[LACP_TIMER_PERIODIC] = lacp_sm_ptx_timer,
 	[LACP_TIMER_WAIT_WHILE] = lacp_sm_mux_timer,
 };
 
 /*
  * lacp_input: dispatch a received slow-protocols frame by subtype.
  *
  * LACP and marker PDUs are consumed (NULL is returned); frames too short to
  * carry a subtype byte are freed; any other subtype is handed back to the
  * caller untouched.
  */
 struct mbuf *
 lacp_input(struct lagg_port *lgp, struct mbuf *m)
 {
 	struct lacp_port *lp = LACP_PORT(lgp);
 	uint8_t subtype;
 
 	if (m->m_pkthdr.len < sizeof(struct ether_header) + sizeof(subtype)) {
 		m_freem(m);
 		return (NULL);
 	}
 
 	/* The subtype is the first byte after the Ethernet header. */
 	m_copydata(m, sizeof(struct ether_header), sizeof(subtype), &subtype);
 
 	if (subtype == SLOWPROTOCOLS_SUBTYPE_LACP) {
 		lacp_pdu_input(lp, m);
 		return (NULL);
 	}
 	if (subtype == SLOWPROTOCOLS_SUBTYPE_MARKER) {
 		lacp_marker_input(lp, m);
 		return (NULL);
 	}
 
 	/* Not a subtype we are interested in */
 	return (m);
 }
 
 /*
  * lacp_pdu_input: process lacpdu
  *
  * Validates a received LACPDU (exact length, multicast flag, slow-protocols
  * destination address, TLV layout) and feeds it to the receive state machine
  * under the LACP lock.  The mbuf is always consumed.
  *
  * Returns 0 on success, ENOMEM if m_pullup() fails, EINVAL if the PDU is
  * rejected or dropped by the RX test hook.
  */
 static int
 lacp_pdu_input(struct lacp_port *lp, struct mbuf *m)
 {
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct lacpdu *du;
 	int dunit;
 
 	if (m->m_pkthdr.len != sizeof(*du)) {
 		goto bad;
 	}
 
 	if ((m->m_flags & M_MCAST) == 0) {
 		goto bad;
 	}
 
 	if (m->m_len < sizeof(*du)) {
 		m = m_pullup(m, sizeof(*du));
 		if (m == NULL) {
 			return (ENOMEM);
 		}
 	}
 
 	du = mtod(m, struct lacpdu *);
 
 	if (memcmp(&du->ldu_eh.ether_dhost,
 	    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) {
 		goto bad;
 	}
 
 	/*
 	 * ignore the version for compatibility with
 	 * the future protocol revisions.
 	 */
 #if 0
 	if (du->ldu_sph.sph_version != 1) {
 		goto bad;
 	}
 #endif
 
 	/*
 	 * ignore tlv types for compatibility with
 	 * the future protocol revisions.
 	 */
 	if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor,
 	    lacp_info_tlv_template, FALSE)) {
 		goto bad;
 	}
 
 	if (V_lacp_debug > 0) {
 		lacp_dprintf(lp, "lacpdu receive\n");
 		lacp_dump_lacpdu(du);
 	}
 
 	/*
 	 * RX test hook: lsc_rx_test is a bitmask indexed by interface unit
 	 * number.  Guard the shift, since shifting by a negative count or by
 	 * 32 or more is undefined; such units can never match the mask.
 	 */
 	dunit = lp->lp_ifp->if_dunit;
 	if (dunit >= 0 && dunit < 32 &&
 	    ((1U << dunit) & lp->lp_lsc->lsc_debug.lsc_rx_test) != 0) {
 		LACP_TPRINTF((lp, "Dropping RX PDU\n"));
 		goto bad;
 	}
 
 	LACP_LOCK(lsc);
 	lacp_sm_rx(lp, du);
 	LACP_UNLOCK(lsc);
 
 	m_freem(m);
 	return (0);
 
 bad:
 	m_freem(m);
 	return (EINVAL);
 }
 
 /*
  * Fill *info with this port's actor parameters: fixed system and port
  * priorities, the lagg interface's link-layer address as the system id, the
  * port interface's index as the port number, and the current port state.
  */
 static void
 lacp_fill_actorinfo(struct lacp_port *lp, struct lacp_peerinfo *info)
 {
 	struct lagg_softc *sc = lp->lp_lagg->lp_softc;
 
 	/* System identifier. */
 	info->lip_systemid.lsi_prio = htons(LACP_SYSTEM_PRIO);
 	memcpy(&info->lip_systemid.lsi_mac, IF_LLADDR(sc->sc_ifp),
 	    ETHER_ADDR_LEN);
 
 	/* Port identifier. */
 	info->lip_portid.lpi_prio = htons(LACP_PORT_PRIO);
 	info->lip_portid.lpi_portno = htons(lp->lp_ifp->if_index);
 
 	info->lip_state = lp->lp_state;
 }
 
 /*
  * Fill *info with the requester fields of a marker PDU: the port's interface
  * index, the port's system MAC, and a transaction id starting at zero.
  */
 static void
 lacp_fill_markerinfo(struct lacp_port *lp, struct lacp_markerinfo *info)
 {
 	/* Fill in the port index and system id (encoded as the MAC) */
 	info->mi_rq_port = htons(lp->lp_ifp->if_index);
 	memcpy(&info->mi_rq_system, lp->lp_systemid.lsi_mac, ETHER_ADDR_LEN);
 	info->mi_rq_xid = htonl(0);
 }
 
 static int
 lacp_xmit_lacpdu(struct lacp_port *lp)
 {
 	struct lagg_port *lgp = lp->lp_lagg;
 	struct mbuf *m;
 	struct lacpdu *du;
 	int error;
 
 	LACP_LOCK_ASSERT(lp->lp_lsc);
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		return (ENOMEM);
 	}
 	m->m_len = m->m_pkthdr.len = sizeof(*du);
 
 	du = mtod(m, struct lacpdu *);
 	memset(du, 0, sizeof(*du));
 
 	memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols,
 	    ETHER_ADDR_LEN);
 	memcpy(&du->ldu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN);
 	du->ldu_eh.ether_type = htons(ETHERTYPE_SLOW);
 
 	du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP;
 	du->ldu_sph.sph_version = 1;
 
 	TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor));
 	du->ldu_actor = lp->lp_actor;
 
 	TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO,
 	    sizeof(du->ldu_partner));
 	du->ldu_partner = lp->lp_partner;
 
 	TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO,
 	    sizeof(du->ldu_collector));
 	du->ldu_collector.lci_maxdelay = 0;
 
 	if (V_lacp_debug > 0) {
 		lacp_dprintf(lp, "lacpdu transmit\n");
 		lacp_dump_lacpdu(du);
 	}
 
 	m->m_flags |= M_MCAST;
 
 	/*
 	 * XXX should use higher priority queue.
 	 * otherwise network congestion can break aggregation.
 	 */
 
 	error = lagg_enqueue(lp->lp_ifp, m);
 	return (error);
 }
 
 /*
  * Build a marker PDU carrying the port's marker info, with the transaction
  * id advanced by one, and queue it on the port's interface.  Must be called
  * with the LACP lock held.
  *
  * Returns ENOMEM if no mbuf is available, otherwise the lagg_enqueue() result.
  */
 static int
 lacp_xmit_marker(struct lacp_port *lp)
 {
 	struct markerdu *pdu;
 	struct mbuf *m;
 
 	LACP_LOCK_ASSERT(lp->lp_lsc);
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		return (ENOMEM);
 	}
 	m->m_pkthdr.len = sizeof(*pdu);
 	m->m_len = sizeof(*pdu);
 
 	pdu = mtod(m, struct markerdu *);
 	memset(pdu, 0, sizeof(*pdu));
 
 	/* Ethernet header: slow-protocols multicast, from the port's address. */
 	memcpy(&pdu->mdu_eh.ether_dhost, ethermulticastaddr_slowprotocols,
 	    ETHER_ADDR_LEN);
 	memcpy(&pdu->mdu_eh.ether_shost, lp->lp_lagg->lp_lladdr,
 	    ETHER_ADDR_LEN);
 	pdu->mdu_eh.ether_type = htons(ETHERTYPE_SLOW);
 
 	/* Slow-protocols header. */
 	pdu->mdu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_MARKER;
 	pdu->mdu_sph.sph_version = 1;
 
 	/* Bump the transaction id and copy over the marker info */
 	lp->lp_marker.mi_rq_xid = htonl(ntohl(lp->lp_marker.mi_rq_xid) + 1);
 	TLV_SET(&pdu->mdu_tlv, MARKER_TYPE_INFO, sizeof(pdu->mdu_info));
 	pdu->mdu_info = lp->lp_marker;
 
 	LACP_DPRINTF((lp, "marker transmit, port=%u, sys=%6D, id=%u\n",
 	    ntohs(pdu->mdu_info.mi_rq_port), pdu->mdu_info.mi_rq_system, ":",
 	    ntohl(pdu->mdu_info.mi_rq_xid)));
 
 	m->m_flags |= M_MCAST;
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 void
 lacp_linkstate(struct lagg_port *lgp)
 {
 	struct lacp_port *lp = LACP_PORT(lgp);
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct ifnet *ifp = lgp->lp_ifp;
 	struct ifmediareq ifmr;
 	int error = 0;
 	u_int media;
 	uint8_t old_state;
 	uint16_t old_key;
 
 	bzero((char *)&ifmr, sizeof(ifmr));
 	error = (*ifp->if_ioctl)(ifp, SIOCGIFXMEDIA, (caddr_t)&ifmr);
 	if (error != 0) {
 		bzero((char *)&ifmr, sizeof(ifmr));
 		error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr);
 	}
 	if (error != 0)
 		return;
 
 	LACP_LOCK(lsc);
 	media = ifmr.ifm_active;
 	LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x, ether = %d, fdx = %d, "
 	    "link = %d\n", lp->lp_media, media, IFM_TYPE(media) == IFM_ETHER,
 	    (media & IFM_FDX) != 0, ifp->if_link_state == LINK_STATE_UP));
 	old_state = lp->lp_state;
 	old_key = lp->lp_key;
 
 	lp->lp_media = media;
 	/*
 	 * If the port is not an active full duplex Ethernet link then it can
 	 * not be aggregated.
 	 */
 	if (IFM_TYPE(media) != IFM_ETHER || (media & IFM_FDX) == 0 ||
 	    ifp->if_link_state != LINK_STATE_UP) {
 		lacp_port_disable(lp);
 	} else {
 		lacp_port_enable(lp);
 	}
 	lp->lp_key = lacp_compose_key(lp);
 
 	if (old_state != lp->lp_state || old_key != lp->lp_key) {
 		LACP_DPRINTF((lp, "-> UNSELECTED\n"));
 		lp->lp_selected = LACP_UNSELECTED;
 	}
 	LACP_UNLOCK(lsc);
 }
 
 /*
  * Once-per-second callout: for every port with the AGGREGATION state bit
  * set, run its timers and then the selection, mux, transmit and periodic
  * transmit machines in that port's vnet.  Re-arms itself for hz ticks later.
  */
 static void
 lacp_tick(void *arg)
 {
 	struct lacp_softc *lsc = arg;
 	struct lacp_port *port;
 
 	LIST_FOREACH(port, &lsc->lsc_ports, lp_next) {
 		if ((port->lp_state & LACP_STATE_AGGREGATION) != 0) {
 			CURVNET_SET(port->lp_ifp->if_vnet);
 			lacp_run_timers(port);
 
 			lacp_select(port);
 			lacp_sm_mux(port);
 			lacp_sm_tx(port);
 			lacp_sm_ptx_tx_schedule(port);
 			CURVNET_RESTORE();
 		}
 	}
 	callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc);
 }
 
 /*
  * Attach LACP state to a lagg port: join the slow-protocols multicast group
  * on the port's interface, allocate and register the lacp_port, initialise
  * its actor/marker information and receive machine, and evaluate the current
  * link state.
  *
  * Returns 0 on success, the if_addmulti() error if the multicast join fails,
  * or ENOMEM if the port structure cannot be allocated.
  */
 int
 lacp_port_create(struct lagg_port *lgp)
 {
 	struct lagg_softc *sc = lgp->lp_softc;
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 	struct lacp_port *lp;
 	struct ifnet *ifp = lgp->lp_ifp;
 	struct sockaddr_dl sdl;
 	struct ifmultiaddr *rifma = NULL;
 	int error;
 
 	link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER);
 	sdl.sdl_alen = ETHER_ADDR_LEN;
 
 	bcopy(&ethermulticastaddr_slowprotocols,
 	    LLADDR(&sdl), ETHER_ADDR_LEN);
 	error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
 	if (error) {
 		printf("%s: ADDMULTI failed on %s\n", __func__,
 		    lgp->lp_ifp->if_xname);
 		return (error);
 	}
 
 	lp = malloc(sizeof(struct lacp_port),
 	    M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (lp == NULL) {
 		/*
 		 * Nothing will ever reference rifma on this path, so drop the
 		 * membership taken above instead of leaking it.
 		 */
 		if_delmulti_ifma(rifma);
 		return (ENOMEM);
 	}
 
 	LACP_LOCK(lsc);
 	lgp->lp_psc = lp;
 	lp->lp_ifp = ifp;
 	lp->lp_lagg = lgp;
 	lp->lp_lsc = lsc;
 	/* Remembered so lacp_port_destroy() can leave the group again. */
 	lp->lp_ifma = rifma;
 
 	LIST_INSERT_HEAD(&lsc->lsc_ports, lp, lp_next);
 
 	lacp_fill_actorinfo(lp, &lp->lp_actor);
 	lacp_fill_markerinfo(lp, &lp->lp_marker);
 	lp->lp_state = LACP_STATE_ACTIVITY;
 	lp->lp_aggregator = NULL;
 	lacp_sm_rx_set_expired(lp);
 	LACP_UNLOCK(lsc);
 	/* lacp_linkstate() takes the LACP lock itself. */
 	lacp_linkstate(lgp);
 
 	return (0);
 }
 
 /*
  * Detach LACP state from a lagg port: disarm its timers, stop collecting
  * and distributing, unselect it, unlink it from the port list, leave the
  * slow-protocols multicast group (unless the interface is detaching) and
  * free the port structure.
  */
 void
 lacp_port_destroy(struct lagg_port *lgp)
 {
 	struct lacp_port *lp = LACP_PORT(lgp);
 	struct lacp_softc *lsc = lp->lp_lsc;
 	int timer;
 
 	LACP_LOCK(lsc);
 	timer = 0;
 	while (timer < LACP_NTIMER) {
 		LACP_TIMER_DISARM(lp, timer);
 		timer++;
 	}
 
 	lacp_disable_collecting(lp);
 	lacp_disable_distributing(lp);
 	lacp_unselect(lp);
 
 	LIST_REMOVE(lp, lp_next);
 	LACP_UNLOCK(lsc);
 
 	/* The address may have already been removed by if_purgemaddrs() */
 	if (!lgp->lp_detaching) {
 		if_delmulti_ifma(lp->lp_ifma);
 	}
 
 	free(lp, M_DEVBUF);
 }
 
 /*
  * lacp_req: fill a struct lacp_opreq with the actor and partner parameters
  * of the currently active aggregator.
  *
  * 'data' is interpreted as a struct lacp_opreq and is always zeroed first.
  * It is left zeroed when the lagg has no LACP softc or no active aggregator.
  * Multi-byte fields are converted with ntohs() before being stored.
  */
 void
 lacp_req(struct lagg_softc *sc, void *data)
 {
 	struct lacp_opreq *req = (struct lacp_opreq *)data;
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 	struct lacp_aggregator *la;
 
 	bzero(req, sizeof(struct lacp_opreq));
 
 	/*
 	 * If the LACP softc is NULL, return with the opreq structure full of
 	 * zeros.  It is normal for the softc to be NULL while the lagg is
 	 * being destroyed.
 	 */
 	if (NULL == lsc)
 		return;
 
 	LACP_LOCK(lsc);
 	/*
 	 * Sample the active aggregator only while holding the lock: the
 	 * pointer is changed, and aggregators are freed, by code that runs
 	 * under the LACP lock, so a value read before locking could be stale
 	 * by the time it is dereferenced.
 	 */
 	la = lsc->lsc_active_aggregator;
 	if (la != NULL) {
 		req->actor_prio = ntohs(la->la_actor.lip_systemid.lsi_prio);
 		memcpy(&req->actor_mac, &la->la_actor.lip_systemid.lsi_mac,
 		    ETHER_ADDR_LEN);
 		req->actor_key = ntohs(la->la_actor.lip_key);
 		req->actor_portprio = ntohs(la->la_actor.lip_portid.lpi_prio);
 		req->actor_portno = ntohs(la->la_actor.lip_portid.lpi_portno);
 		req->actor_state = la->la_actor.lip_state;
 
 		req->partner_prio = ntohs(la->la_partner.lip_systemid.lsi_prio);
 		memcpy(&req->partner_mac, &la->la_partner.lip_systemid.lsi_mac,
 		    ETHER_ADDR_LEN);
 		req->partner_key = ntohs(la->la_partner.lip_key);
 		req->partner_portprio = ntohs(la->la_partner.lip_portid.lpi_prio);
 		req->partner_portno = ntohs(la->la_partner.lip_portid.lpi_portno);
 		req->partner_state = la->la_partner.lip_state;
 	}
 	LACP_UNLOCK(lsc);
 }
 
 void
 lacp_portreq(struct lagg_port *lgp, void *data)
 {
 	struct lacp_opreq *req = (struct lacp_opreq *)data;
 	struct lacp_port *lp = LACP_PORT(lgp);
 	struct lacp_softc *lsc = lp->lp_lsc;
 
 	LACP_LOCK(lsc);
 	req->actor_prio = ntohs(lp->lp_actor.lip_systemid.lsi_prio);
 	memcpy(&req->actor_mac, &lp->lp_actor.lip_systemid.lsi_mac,
 	    ETHER_ADDR_LEN);
 	req->actor_key = ntohs(lp->lp_actor.lip_key);
 	req->actor_portprio = ntohs(lp->lp_actor.lip_portid.lpi_prio);
 	req->actor_portno = ntohs(lp->lp_actor.lip_portid.lpi_portno);
 	req->actor_state = lp->lp_actor.lip_state;
 
 	req->partner_prio = ntohs(lp->lp_partner.lip_systemid.lsi_prio);
 	memcpy(&req->partner_mac, &lp->lp_partner.lip_systemid.lsi_mac,
 	    ETHER_ADDR_LEN);
 	req->partner_key = ntohs(lp->lp_partner.lip_key);
 	req->partner_portprio = ntohs(lp->lp_partner.lip_portid.lpi_prio);
 	req->partner_portno = ntohs(lp->lp_partner.lip_portid.lpi_portno);
 	req->partner_state = lp->lp_partner.lip_state;
 	LACP_UNLOCK(lsc);
 }
 
 /*
  * lacp_disable_collecting: clear LACP_STATE_COLLECTING in the port's
  * actor state, after emitting a debug message.
  */
 static void
 lacp_disable_collecting(struct lacp_port *lp)
 {
 
 	LACP_DPRINTF((lp, "collecting disabled\n"));
 	lp->lp_state = lp->lp_state & ~LACP_STATE_COLLECTING;
 }
 
 /*
  * lacp_enable_collecting: set LACP_STATE_COLLECTING in the port's actor
  * state, after emitting a debug message.
  */
 static void
 lacp_enable_collecting(struct lacp_port *lp)
 {
 
 	LACP_DPRINTF((lp, "collecting enabled\n"));
 	lp->lp_state = lp->lp_state | LACP_STATE_COLLECTING;
 }
 
 /*
  * lacp_disable_distributing: take a port out of its aggregator's
  * distributing set.
  *
  * Does nothing when the port has no aggregator or LACP_STATE_DISTRIBUTING
  * is already clear.  Otherwise the port is removed from la_ports, the
  * aggregator's port count and the lagg's sc_active are updated and, when
  * this aggregator is the active one, distribution is suppressed, the active
  * aggregator is re-selected and the port map is rebuilt.  Finally the
  * DISTRIBUTING bit is cleared and the lagg interface link state is set from
  * sc_active.
  *
  * The LACP lock must be held by the caller (asserted).
  */
 static void
 lacp_disable_distributing(struct lacp_port *lp)
 {
 	struct lacp_aggregator *la = lp->lp_aggregator;
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct lagg_softc *sc = lsc->lsc_softc;
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	LACP_LOCK_ASSERT(lsc);
 
 	/* Nothing to undo if the port is not currently distributing. */
 	if (la == NULL || (lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) {
 		return;
 	}
 
 	KASSERT(!TAILQ_EMPTY(&la->la_ports), ("no aggregator ports"));
 	KASSERT(la->la_nports > 0, ("nports invalid (%d)", la->la_nports));
 	KASSERT(la->la_refcnt >= la->la_nports, ("aggregator refcnt invalid"));
 
 	LACP_DPRINTF((lp, "disable distributing on aggregator %s, "
 	    "nports %d -> %d\n",
 	    lacp_format_lagid_aggregator(la, buf, sizeof(buf)),
 	    la->la_nports, la->la_nports - 1));
 
 	TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q);
 	la->la_nports--;
 	/* sc_active is set from this aggregator's remaining port count. */
 	sc->sc_active = la->la_nports;
 
 	if (lsc->lsc_active_aggregator == la) {
 		lacp_suppress_distributing(lsc, la);
 		lacp_select_active_aggregator(lsc);
 		/* regenerate the port map, the active aggregator has changed */
 		lacp_update_portmap(lsc);
 	}
 
 	lp->lp_state &= ~LACP_STATE_DISTRIBUTING;
 	if_link_state_change(sc->sc_ifp,
 	    sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
 /*
  * lacp_enable_distributing: add a port to its aggregator's distributing
  * set.
  *
  * Does nothing when LACP_STATE_DISTRIBUTING is already set.  Otherwise the
  * port is inserted at the head of la_ports, the aggregator's port count and
  * the lagg's sc_active are updated and the DISTRIBUTING bit is set.  If the
  * aggregator is already the active one, distribution is suppressed and the
  * port map rebuilt; if not, active-aggregator selection is re-run.  The
  * lagg interface link state is then set from sc_active.
  *
  * The LACP lock must be held by the caller (asserted).
  * NOTE(review): lp->lp_aggregator is dereferenced without a NULL check, so
  * callers are presumably expected to have selected an aggregator first.
  */
 static void
 lacp_enable_distributing(struct lacp_port *lp)
 {
 	struct lacp_aggregator *la = lp->lp_aggregator;
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct lagg_softc *sc = lsc->lsc_softc;
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	LACP_LOCK_ASSERT(lsc);
 
 	/* Already distributing: nothing to do. */
 	if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) {
 		return;
 	}
 
 	LACP_DPRINTF((lp, "enable distributing on aggregator %s, "
 	    "nports %d -> %d\n",
 	    lacp_format_lagid_aggregator(la, buf, sizeof(buf)),
 	    la->la_nports, la->la_nports + 1));
 
 	/* The port's own reference must not yet be counted in la_nports. */
 	KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid"));
 	TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q);
 	la->la_nports++;
 	sc->sc_active = la->la_nports;
 
 	lp->lp_state |= LACP_STATE_DISTRIBUTING;
 
 	if (lsc->lsc_active_aggregator == la) {
 		lacp_suppress_distributing(lsc, la);
 		lacp_update_portmap(lsc);
 	} else
 		/* try to become the active aggregator */
 		lacp_select_active_aggregator(lsc);
 
 	if_link_state_change(sc->sc_ifp,
 	    sc->sc_active ? LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
 /*
  * lacp_transit_expire: handler armed by lacp_suppress_distributing() on
  * lsc_transit_callout.
  *
  * 'vp' is the struct lacp_softc.  Clears lsc_suppress_distributing so that
  * lacp_select_tx_port_by_hash() stops rejecting packets.  The LACP lock is
  * expected to be held on entry (asserted).
  */
 static void
 lacp_transit_expire(void *vp)
 {
 	struct lacp_softc *lsc = vp;
 
 	LACP_LOCK_ASSERT(lsc);
 
 	/* The trace output is emitted in the vnet of the lagg interface. */
 	CURVNET_SET(lsc->lsc_softc->sc_ifp->if_vnet);
 	LACP_TRACE(NULL);
 	CURVNET_RESTORE();
 
 	lsc->lsc_suppress_distributing = FALSE;
 }
 
 /*
  * lacp_attach: allocate and initialize the LACP softc for a lagg.
  *
  * The softc is allocated zeroed with M_WAITOK, cross-linked with the lagg
  * softc, and its lock, lists and the two callouts (both tied to the softc
  * mutex) are initialized.  If the lagg interface is already running,
  * lacp_init() is called as well.
  */
 void
 lacp_attach(struct lagg_softc *sc)
 {
 	struct lacp_softc *lsc;
 
 	lsc = malloc(sizeof(*lsc), M_DEVBUF, M_WAITOK | M_ZERO);
 
 	/* Cross-link the lagg softc and the LACP softc. */
 	lsc->lsc_softc = sc;
 	sc->sc_psc = lsc;
 
 	/* Plain field and list initialization. */
 	lsc->lsc_active_aggregator = NULL;
 	lsc->lsc_strict_mode = VNET(lacp_default_strict_mode);
 	lsc->lsc_hashkey = m_ether_tcpip_hash_init();
 	LIST_INIT(&lsc->lsc_ports);
 	TAILQ_INIT(&lsc->lsc_aggregators);
 
 	/* The lock must exist before the callouts are bound to it. */
 	LACP_LOCK_INIT(lsc);
 	callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_mtx, 0);
 	callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_mtx, 0);
 
 	/* if the lagg is already up then do the same */
 	if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		lacp_init(sc);
 	}
 }
 
 /*
  * lacp_detach: release the LACP softc passed as 'psc'.
  *
  * Asserts that no aggregators remain and none is active, drains both
  * callouts, destroys the lock and frees the softc.
  */
 void
 lacp_detach(void *psc)
 {
 	struct lacp_softc *lsc = (struct lacp_softc *)psc;
 
 	KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators),
 	    ("aggregators still active"));
 	KASSERT(lsc->lsc_active_aggregator == NULL,
 	    ("aggregator still attached"));
 
 	/* Both callouts are drained before the lock they use is destroyed. */
 	callout_drain(&lsc->lsc_transit_callout);
 	callout_drain(&lsc->lsc_callout);
 
 	LACP_LOCK_DESTROY(lsc);
 	free(lsc, M_DEVBUF);
 }
 
 /*
  * lacp_init: start the periodic LACP tick for a lagg.
  *
  * Under the LACP lock, (re)arms lsc_callout to call lacp_tick() with the
  * softc after hz ticks.
  */
 void
 lacp_init(struct lagg_softc *sc)
 {
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 
 	LACP_LOCK(lsc);
 	callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc);
 	LACP_UNLOCK(lsc);
 }
 
 /*
  * lacp_stop: stop the LACP callouts for a lagg.
  *
  * Under the LACP lock, stops both the transit callout and the periodic
  * tick callout.
  */
 void
 lacp_stop(struct lagg_softc *sc)
 {
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 
 	LACP_LOCK(lsc);
 	callout_stop(&lsc->lsc_transit_callout);
 	callout_stop(&lsc->lsc_callout);
 	LACP_UNLOCK(lsc);
 }
 
 /*
  * lacp_select_tx_port_by_hash: pick the transmit port for a flow hash.
  *
  * Returns the lagg port found at index (hash modulo port count) in the
  * currently active port map.  When built with NUMA, LAGG_OPT_USE_NUMA is
  * set, more than one domain has ports and 'numa_domain' is in range, the
  * per-domain map is used if it has any ports; otherwise the global map is
  * used.
  *
  * Returns NULL with *err = ENOBUFS while distribution is suppressed, or
  * with *err = ENETDOWN when the active map is empty.  *err is not written
  * on success.
  */
 struct lagg_port *
 lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash,
     uint8_t numa_domain, int *err)
 {
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 	struct lacp_portmap *pm;
 	struct lacp_port **map;
 	int count;
 #ifdef NUMA
 	int dom_count;
 #endif
 
 	if (__predict_false(lsc->lsc_suppress_distributing)) {
 		LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
 		*err = ENOBUFS;
 		return (NULL);
 	}
 
 	pm = &lsc->lsc_pmap[lsc->lsc_activemap];
 	if (pm->pm_count == 0) {
 		LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
 		*err = ENETDOWN;
 		return (NULL);
 	}
 
 	/* Start from the aggregator-wide map ... */
 	map = pm->pm_map;
 	count = pm->pm_count;
 #ifdef NUMA
 	/* ... and narrow to the caller's domain when it has ports. */
 	if ((sc->sc_opts & LAGG_OPT_USE_NUMA) &&
 	    pm->pm_num_dom > 1 && numa_domain < MAXMEMDOM) {
 		dom_count = pm->pm_numa[numa_domain].count;
 		if (dom_count > 0) {
 			map = pm->pm_numa[numa_domain].map;
 			count = dom_count;
 		}
 	}
 #endif
 
 	hash %= count;
 
 	return (map[hash]->lp_lagg);
 }
 
 /*
  * lacp_select_tx_port: pick the transmit port for an mbuf.
  *
  * The hash is the mbuf's flowid shifted right by sc->flowid_shift when
  * LAGG_OPT_USE_FLOWID is set and the mbuf carries a hash type; otherwise it
  * is computed with m_ether_tcpip_hash().  Port selection and error
  * reporting through *err are delegated to lacp_select_tx_port_by_hash().
  */
 struct lagg_port *
 lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m, int *err)
 {
 	struct lacp_softc *lsc = LACP_SOFTC(sc);
 	uint32_t hash;
 	int have_flowid;
 
 	have_flowid = (sc->sc_opts & LAGG_OPT_USE_FLOWID) != 0 &&
 	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE;
 
 	if (have_flowid) {
 		hash = m->m_pkthdr.flowid >> sc->flowid_shift;
 	} else {
 		hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey);
 	}
 
 	return (lacp_select_tx_port_by_hash(sc, hash,
 	    m->m_pkthdr.numa_domain, err));
 }
 
 /*
  * lacp_suppress_distributing: drop transmit packets for a while
  * to preserve packet ordering.
  *
  * Does nothing unless 'la' is the active aggregator.  Otherwise sets
  * lsc_suppress_distributing, sends a marker frame on every port of the
  * softc and arms lsc_transit_callout so that lacp_transit_expire() runs
  * after LACP_TRANSIT_DELAY * hz / 1000 ticks.
  */
 
 static void
 lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la)
 {
 	struct lacp_port *lp;
 
 	if (lsc->lsc_active_aggregator != la) {
 		return;
 	}
 
 	LACP_TRACE(NULL);
 
 	lsc->lsc_suppress_distributing = TRUE;
 
 	/* send a marker frame down each port to verify the queues are empty */
 	LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) {
 		/* The MARK flag is kept only if the marker was handed off. */
 		lp->lp_flags |= LACP_PORT_MARK;
 		if (lacp_xmit_marker(lp) != 0)
 			lp->lp_flags &= ~LACP_PORT_MARK;
 	}
 
 	/* set a timeout for the marker frames */
 	callout_reset(&lsc->lsc_transit_callout,
 	    LACP_TRANSIT_DELAY * hz / 1000, lacp_transit_expire, lsc);
 }
 
 /*
  * lacp_compare_peerinfo: memcmp() two peerinfo structures over the bytes
  * that precede the lip_state member.  Returns 0 when those bytes are equal.
  */
 static int
 lacp_compare_peerinfo(const struct lacp_peerinfo *a,
     const struct lacp_peerinfo *b)
 {
 	const size_t cmplen = offsetof(struct lacp_peerinfo, lip_state);
 
 	return (memcmp(a, b, cmplen));
 }
 
 static int
 lacp_compare_systemid(const struct lacp_systemid *a,
     const struct lacp_systemid *b)
 {
 	return (memcmp(a, b, sizeof(*a)));
 }
 
 #if 0	/* unused */
 /*
  * lacp_compare_portid: byte-wise comparison of two port identifiers.
  * Compiled out; kept for reference only.
  */
 static int
 lacp_compare_portid(const struct lacp_portid *a,
     const struct lacp_portid *b)
 {
 	return (memcmp(a, b, sizeof(*a)));
 }
 #endif
 
 /*
  * lacp_aggregator_bandwidth: estimate an aggregator's total bandwidth.
  *
  * The baudrate of the first port on la_ports is multiplied by la_nports.
  * Returns 0 when the aggregator has no ports on that list; a zero product
  * is reported through the debug log and returned as-is.
  */
 static uint64_t
 lacp_aggregator_bandwidth(struct lacp_aggregator *la)
 {
 	struct lacp_port *first;
 	uint64_t total;
 
 	first = TAILQ_FIRST(&la->la_ports);
 	if (first == NULL)
 		return (0);
 
 	total = ifmedia_baudrate(first->lp_media);
 	total *= la->la_nports;
 	if (total != 0)
 		return (total);
 
 	LACP_DPRINTF((first, "speed 0? media=0x%x nports=%d\n",
 	    first->lp_media, la->la_nports));
 
 	return (total);
 }
 
 /*
  * lacp_select_active_aggregator: select an aggregator to be used to transmit
  * packets from lagg(4) interface.
  *
  * Walks every aggregator that has at least one distributing port and keeps
  * the best candidate according to the rule documented inside the loop.  If
  * the winner differs from the current active aggregator, the active pointer
  * is switched, the port map is rebuilt and, when a winner exists,
  * distribution is suppressed for the transit period.
  */
 
 static void
 lacp_select_active_aggregator(struct lacp_softc *lsc)
 {
 	struct lacp_aggregator *la;
 	struct lacp_aggregator *best_la = NULL;
 	uint64_t best_speed = 0;
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	LACP_TRACE(NULL);
 
 	TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) {
 		uint64_t speed;
 
 		if (la->la_nports == 0) {
 			continue;
 		}
 
 		speed = lacp_aggregator_bandwidth(la);
 		/*
 		 * speed is unsigned 64-bit: print it with %ju and an explicit
 		 * uintmax_t cast so the conversion matches the argument type.
 		 */
 		LACP_DPRINTF((NULL, "%s, speed=%ju, nports=%d\n",
 		    lacp_format_lagid_aggregator(la, buf, sizeof(buf)),
 		    (uintmax_t)speed, la->la_nports));
 
 		/*
 		 * This aggregator is chosen if the partner has a better
 		 * system priority or, the total aggregated speed is higher
 		 * or, it is already the chosen aggregator
 		 */
 		if ((best_la != NULL && LACP_SYS_PRI(la->la_partner) <
 		    LACP_SYS_PRI(best_la->la_partner)) ||
 		    speed > best_speed ||
 		    (speed == best_speed &&
 		    la == lsc->lsc_active_aggregator)) {
 			best_la = la;
 			best_speed = speed;
 		}
 	}
 
 	KASSERT(best_la == NULL || best_la->la_nports > 0,
 	    ("invalid aggregator refcnt"));
 	KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports),
 	    ("invalid aggregator list"));
 
 	if (lsc->lsc_active_aggregator != best_la) {
 		LACP_DPRINTF((NULL, "active aggregator changed\n"));
 		LACP_DPRINTF((NULL, "old %s\n",
 		    lacp_format_lagid_aggregator(lsc->lsc_active_aggregator,
 		    buf, sizeof(buf))));
 	} else {
 		LACP_DPRINTF((NULL, "active aggregator not changed\n"));
 	}
 	LACP_DPRINTF((NULL, "new %s\n",
 	    lacp_format_lagid_aggregator(best_la, buf, sizeof(buf))));
 
 	if (lsc->lsc_active_aggregator != best_la) {
 		lsc->lsc_active_aggregator = best_la;
 		lacp_update_portmap(lsc);
 		if (best_la) {
 			lacp_suppress_distributing(lsc, best_la);
 		}
 	}
 }
 
 /*
  * Update the inactive portmap array with the new list of ports and
  * make it live.
  *
  * The softc keeps two port maps in lsc_pmap[]; the one not named by
  * lsc_activemap is cleared, refilled from the active aggregator's la_ports
  * list (plus, with NUMA, per-domain sub-maps) and then published by
  * storing its index into lsc_activemap with release semantics.  The lagg
  * interface baudrate is set to the aggregator bandwidth (0 when there is
  * no usable aggregator) and an IFNET_EVENT_UPDATE_BAUDRATE event is raised.
  */
 static void
 lacp_update_portmap(struct lacp_softc *lsc)
 {
 	struct lagg_softc *sc = lsc->lsc_softc;
 	struct lacp_aggregator *la;
 	struct lacp_portmap *p;
 	struct lacp_port *lp;
 	uint64_t speed;
 	u_int newmap;
 	int i;
 #ifdef NUMA
 	int count;
 	uint8_t domain;
 #endif
 
 	/* Work on whichever of the two maps is currently not live. */
 	newmap = lsc->lsc_activemap == 0 ? 1 : 0;
 	p = &lsc->lsc_pmap[newmap];
 	la = lsc->lsc_active_aggregator;
 	speed = 0;
 	bzero(p, sizeof(struct lacp_portmap));
 
 	if (la != NULL && la->la_nports > 0) {
 		p->pm_count = la->la_nports;
 		i = 0;
 		TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) {
 			p->pm_map[i++] = lp;
 #ifdef NUMA
 			/* Ports with an out-of-range domain stay global-only. */
 			domain = lp->lp_ifp->if_numa_domain;
 			if (domain >= MAXMEMDOM)
 				continue;
 			count = p->pm_numa[domain].count;
 			p->pm_numa[domain].map[count] = lp;
 			p->pm_numa[domain].count++;
 #endif
 		}
 		KASSERT(i == p->pm_count, ("Invalid port count"));
 
 #ifdef NUMA
 		/* Count how many domains ended up with at least one port. */
 		for (i = 0; i < MAXMEMDOM; i++) {
 			if (p->pm_numa[i].count != 0)
 				p->pm_num_dom++;
 		}
 #endif
 		speed = lacp_aggregator_bandwidth(la);
 	}
 	sc->sc_ifp->if_baudrate = speed;
 	EVENTHANDLER_INVOKE(ifnet_event, sc->sc_ifp,
 	    IFNET_EVENT_UPDATE_BAUDRATE);
 
 	/* switch the active portmap over */
 	atomic_store_rel_int(&lsc->lsc_activemap, newmap);
 	LACP_DPRINTF((NULL, "Set table %d with %d ports\n",
 		    lsc->lsc_activemap,
 		    lsc->lsc_pmap[lsc->lsc_activemap].pm_count));
 }
 
 /*
  * lacp_compose_key: build the 16-bit key for a port.
  *
  * For a port without LACP_STATE_AGGREGATION the key is the port's if_index
  * with bit 15 set.  For an aggregatable port the low bits hold one
  * representative media subtype per speed class (subtypes not listed fall
  * through unchanged) and bits 5..14 hold bits of the lagg interface's
  * if_index; bit 15 is left clear.
  *
  * The result is returned after htons().
  */
 static uint16_t
 lacp_compose_key(struct lacp_port *lp)
 {
 	struct lagg_port *lgp = lp->lp_lagg;
 	struct lagg_softc *sc = lgp->lp_softc;
 	u_int media = lp->lp_media;
 	uint16_t key;
 
 	if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) {
 		/*
 		 * non-aggregatable links should have unique keys.
 		 *
 		 * XXX this isn't really unique as if_index is 16 bit.
 		 */
 
 		/* bit 0..14:	(some bits of) if_index of this port */
 		key = lp->lp_ifp->if_index;
 		/* bit 15:	1 */
 		key |= 0x8000;
 	} else {
 		u_int subtype = IFM_SUBTYPE(media);
 
 		KASSERT(IFM_TYPE(media) == IFM_ETHER, ("invalid media type"));
 		KASSERT((media & IFM_FDX) != 0, ("aggregating HDX interface"));
 
 		/* bit 0..4:	IFM_SUBTYPE modulo speed */
 		/* Every subtype of one speed class maps to a single value. */
 		switch (subtype) {
 		case IFM_10_T:
 		case IFM_10_2:
 		case IFM_10_5:
 		case IFM_10_STP:
 		case IFM_10_FL:
 			key = IFM_10_T;
 			break;
 		case IFM_100_TX:
 		case IFM_100_FX:
 		case IFM_100_T4:
 		case IFM_100_VG:
 		case IFM_100_T2:
 		case IFM_100_T:
 		case IFM_100_SGMII:
 			key = IFM_100_TX;
 			break;
 		case IFM_1000_SX:
 		case IFM_1000_LX:
 		case IFM_1000_CX:
 		case IFM_1000_T:
 		case IFM_1000_KX:
 		case IFM_1000_SGMII:
 		case IFM_1000_CX_SGMII:
 			key = IFM_1000_SX;
 			break;
 		case IFM_10G_LR:
 		case IFM_10G_SR:
 		case IFM_10G_CX4:
 		case IFM_10G_TWINAX:
 		case IFM_10G_TWINAX_LONG:
 		case IFM_10G_LRM:
 		case IFM_10G_T:
 		case IFM_10G_KX4:
 		case IFM_10G_KR:
 		case IFM_10G_CR1:
 		case IFM_10G_ER:
 		case IFM_10G_SFI:
 		case IFM_10G_AOC:
 			key = IFM_10G_LR;
 			break;
 		case IFM_20G_KR2:
 			key = IFM_20G_KR2;
 			break;
 		case IFM_2500_KX:
 		case IFM_2500_T:
 		case IFM_2500_X:
 			key = IFM_2500_KX;
 			break;
 		case IFM_5000_T:
 		case IFM_5000_KR:
 		case IFM_5000_KR_S:
 		case IFM_5000_KR1:
 			key = IFM_5000_T;
 			break;
 		case IFM_50G_PCIE:
 		case IFM_50G_CR2:
 		case IFM_50G_KR2:
 		case IFM_50G_KR4:
 		case IFM_50G_SR2:
 		case IFM_50G_LR2:
 		case IFM_50G_LAUI2_AC:
 		case IFM_50G_LAUI2:
 		case IFM_50G_AUI2_AC:
 		case IFM_50G_AUI2:
 		case IFM_50G_CP:
 		case IFM_50G_SR:
 		case IFM_50G_LR:
 		case IFM_50G_FR:
 		case IFM_50G_KR_PAM4:
 		case IFM_50G_AUI1_AC:
 		case IFM_50G_AUI1:
 			key = IFM_50G_PCIE;
 			break;
 		case IFM_56G_R4:
 			key = IFM_56G_R4;
 			break;
 		case IFM_25G_PCIE:
 		case IFM_25G_CR:
 		case IFM_25G_KR:
 		case IFM_25G_SR:
 		case IFM_25G_LR:
 		case IFM_25G_ACC:
 		case IFM_25G_AOC:
 		case IFM_25G_T:
 		case IFM_25G_CR_S:
 		case IFM_25G_CR1:
 		case IFM_25G_KR_S:
 		case IFM_25G_AUI:
 		case IFM_25G_KR1:
 			key = IFM_25G_PCIE;
 			break;
 		case IFM_40G_CR4:
 		case IFM_40G_SR4:
 		case IFM_40G_LR4:
 		case IFM_40G_LM4:
 		case IFM_40G_XLPPI:
 		case IFM_40G_KR4:
 		case IFM_40G_XLAUI:
 		case IFM_40G_XLAUI_AC:
 		case IFM_40G_ER4:
 			key = IFM_40G_CR4;
 			break;
 		case IFM_100G_CR4:
 		case IFM_100G_SR4:
 		case IFM_100G_KR4:
 		case IFM_100G_LR4:
 		case IFM_100G_CAUI4_AC:
 		case IFM_100G_CAUI4:
 		case IFM_100G_AUI4_AC:
 		case IFM_100G_AUI4:
 		case IFM_100G_CR_PAM4:
 		case IFM_100G_KR_PAM4:
 		case IFM_100G_CP2:
 		case IFM_100G_SR2:
 		case IFM_100G_DR:
 		case IFM_100G_KR2_PAM4:
 		case IFM_100G_CAUI2_AC:
 		case IFM_100G_CAUI2:
 		case IFM_100G_AUI2_AC:
 		case IFM_100G_AUI2:
 			key = IFM_100G_CR4;
 			break;
 		case IFM_200G_CR4_PAM4:
 		case IFM_200G_SR4:
 		case IFM_200G_FR4:
 		case IFM_200G_LR4:
 		case IFM_200G_DR4:
 		case IFM_200G_KR4_PAM4:
 		case IFM_200G_AUI4_AC:
 		case IFM_200G_AUI4:
 		case IFM_200G_AUI8_AC:
 		case IFM_200G_AUI8:
 			key = IFM_200G_CR4_PAM4;
 			break;
 		case IFM_400G_FR8:
 		case IFM_400G_LR8:
 		case IFM_400G_DR4:
 		case IFM_400G_AUI8_AC:
 		case IFM_400G_AUI8:
 			key = IFM_400G_FR8;
 			break;
 		default:
 			/* Unlisted subtype: use it as its own class. */
 			key = subtype;
 			break;
 		}
 		/* bit 5..14:	(some bits of) if_index of lagg device */
 		key |= 0x7fe0 & ((sc->sc_ifp->if_index) << 5);
 		/* bit 15:	0 */
 	}
 	return (htons(key));
 }
 
 /*
  * lacp_aggregator_addref: take one more reference on an aggregator.
  *
  * The aggregator must already be referenced (la_refcnt > 0); afterwards
  * the reference count must exceed the number of distributing ports.  'lsc'
  * is not used by the function body itself.
  */
 static void
 lacp_aggregator_addref(struct lacp_softc *lsc, struct lacp_aggregator *la)
 {
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n",
 	    __func__,
 	    lacp_format_lagid(&la->la_actor, &la->la_partner,
 	    buf, sizeof(buf)),
 	    la->la_refcnt, la->la_refcnt + 1));
 
 	KASSERT(la->la_refcnt > 0, ("refcount <= 0"));
 	la->la_refcnt++;
 	KASSERT(la->la_refcnt > la->la_nports, ("invalid refcount"));
 }
 
 /*
  * lacp_aggregator_delref: drop one reference on an aggregator.
  *
  * When the count reaches zero the aggregator is unlinked from the softc's
  * aggregator list and freed; it must not be the active aggregator at that
  * point (asserted).  Callers must not touch 'la' after the last reference
  * is dropped.
  */
 static void
 lacp_aggregator_delref(struct lacp_softc *lsc, struct lacp_aggregator *la)
 {
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n",
 	    __func__,
 	    lacp_format_lagid(&la->la_actor, &la->la_partner,
 	    buf, sizeof(buf)),
 	    la->la_refcnt, la->la_refcnt - 1));
 
 	/* The reference being dropped must not belong to a distributing port. */
 	KASSERT(la->la_refcnt > la->la_nports, ("invalid refcnt"));
 	la->la_refcnt--;
 	if (la->la_refcnt > 0) {
 		return;
 	}
 
 	KASSERT(la->la_refcnt == 0, ("refcount not zero"));
 	KASSERT(lsc->lsc_active_aggregator != la, ("aggregator active"));
 
 	TAILQ_REMOVE(&lsc->lsc_aggregators, la, la_q);
 
 	free(la, M_DEVBUF);
 }
 
 /*
  * lacp_aggregator_get: allocate an aggregator.
  */
 
 static struct lacp_aggregator *
 lacp_aggregator_get(struct lacp_softc *lsc, struct lacp_port *lp)
 {
 	struct lacp_aggregator *la;
 
 	la = malloc(sizeof(*la), M_DEVBUF, M_NOWAIT);
 	if (la) {
 		la->la_refcnt = 1;
 		la->la_nports = 0;
 		TAILQ_INIT(&la->la_ports);
 		la->la_pending = 0;
 		TAILQ_INSERT_TAIL(&lsc->lsc_aggregators, la, la_q);
 	}
 
 	return (la);
 }
 
 /*
  * lacp_fill_aggregator_id: setup a newly allocated aggregator from a port.
  */
 
 static void
 lacp_fill_aggregator_id(struct lacp_aggregator *la, const struct lacp_port *lp)
 {
 	lacp_fill_aggregator_id_peer(&la->la_partner, &lp->lp_partner);
 	lacp_fill_aggregator_id_peer(&la->la_actor, &lp->lp_actor);
 
 	la->la_actor.lip_state = lp->lp_state & LACP_STATE_AGGREGATION;
 }
 
 /*
  * lacp_fill_aggregator_id_peer: initialize an aggregator-side peerinfo
  * from a port-side one.  The destination is zeroed and then only the
  * system id and the key are copied.
  */
 static void
 lacp_fill_aggregator_id_peer(struct lacp_peerinfo *lpi_aggr,
     const struct lacp_peerinfo *lpi_port)
 {
 
 	memset(lpi_aggr, 0, sizeof(struct lacp_peerinfo));
 	lpi_aggr->lip_key = lpi_port->lip_key;
 	lpi_aggr->lip_systemid = lpi_port->lip_systemid;
 }
 
 /*
  * lacp_aggregator_is_compatible: check if a port can join to an aggregator.
  *
  * Returns 1 only when the port, its partner and the aggregator's actor all
  * have LACP_STATE_AGGREGATION set and both the partner and the actor
  * peerinfo of the port match the aggregator's; returns 0 otherwise.
  */
 
 static int
 lacp_aggregator_is_compatible(const struct lacp_aggregator *la,
     const struct lacp_port *lp)
 {
 
 	if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0)
 		return (0);
 	if ((lp->lp_partner.lip_state & LACP_STATE_AGGREGATION) == 0)
 		return (0);
 	if ((la->la_actor.lip_state & LACP_STATE_AGGREGATION) == 0)
 		return (0);
 
 	/* Partner is checked first, then actor, as separate steps. */
 	return (lacp_peerinfo_is_compatible(&la->la_partner,
 	    &lp->lp_partner) &&
 	    lacp_peerinfo_is_compatible(&la->la_actor, &lp->lp_actor));
 }
 
 /*
  * lacp_peerinfo_is_compatible: return 1 when two peerinfo structures have
  * byte-identical system ids and keys, 0 otherwise.
  */
 static int
 lacp_peerinfo_is_compatible(const struct lacp_peerinfo *a,
     const struct lacp_peerinfo *b)
 {
 	int same_system;
 
 	same_system = memcmp(&a->lip_systemid, &b->lip_systemid,
 	    sizeof(a->lip_systemid)) == 0;
 	if (!same_system)
 		return (0);
 
 	return (memcmp(&a->lip_key, &b->lip_key, sizeof(a->lip_key)) == 0);
 }
 
 /*
  * lacp_port_enable: set LACP_STATE_AGGREGATION in the port's actor state.
  */
 static void
 lacp_port_enable(struct lacp_port *lp)
 {
 
 	lp->lp_state = lp->lp_state | LACP_STATE_AGGREGATION;
 }
 
 /*
  * lacp_port_disable: take a port out of LACP operation.
  *
  * Forces the mux machine to DETACHED, clears the port's AGGREGATION bit,
  * marks it unselected and reloads the default partner information, then
  * clears AGGREGATION in the (defaulted) partner state and the port's
  * EXPIRED bit.
  */
 static void
 lacp_port_disable(struct lacp_port *lp)
 {
 	lacp_set_mux(lp, LACP_MUX_DETACHED);
 
 	lp->lp_state &= ~LACP_STATE_AGGREGATION;
 	lp->lp_selected = LACP_UNSELECTED;
 	/* Overwrites lp_partner, so the partner bit is cleared afterwards. */
 	lacp_sm_rx_record_default(lp);
 	lp->lp_partner.lip_state &= ~LACP_STATE_AGGREGATION;
 	lp->lp_state &= ~LACP_STATE_EXPIRED;
 }
 
 /*
  * lacp_select: select an aggregator.  create one if necessary.
  *
  * Does nothing when the port already has an aggregator or is in the
  * DEFAULTED state.  Otherwise the first compatible aggregator on the
  * softc's list is referenced, or a new one is allocated and filled in from
  * the port.  On success the port points at the aggregator and is marked
  * LACP_SELECTED.  The port is left untouched when allocation fails or when
  * the compatible aggregator already has LACP_MAX_PORTS references.
  */
 static void
 lacp_select(struct lacp_port *lp)
 {
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct lacp_aggregator *la;
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	if (lp->lp_aggregator) {
 		return;
 	}
 
 	/* If we haven't heard from our peer, skip this step. */
 	if (lp->lp_state & LACP_STATE_DEFAULTED)
 		return;
 
 	KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE),
 	    ("timer_wait_while still active"));
 
 	LACP_DPRINTF((lp, "port lagid=%s\n",
 	    lacp_format_lagid(&lp->lp_actor, &lp->lp_partner,
 	    buf, sizeof(buf))));
 
 	/* la is NULL after the loop when no aggregator matched. */
 	TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) {
 		if (lacp_aggregator_is_compatible(la, lp)) {
 			break;
 		}
 	}
 
 	if (la == NULL) {
 		/* A fresh aggregator starts with the one reference we keep. */
 		la = lacp_aggregator_get(lsc, lp);
 		if (la == NULL) {
 			LACP_DPRINTF((lp, "aggregator creation failed\n"));
 
 			/*
 			 * will retry on the next tick.
 			 */
 
 			return;
 		}
 		lacp_fill_aggregator_id(la, lp);
 		LACP_DPRINTF((lp, "aggregator created\n"));
 	} else {
 		LACP_DPRINTF((lp, "compatible aggregator found\n"));
 		/* Aggregator is full; leave the port unselected. */
 		if (la->la_refcnt == LACP_MAX_PORTS)
 			return;
 		lacp_aggregator_addref(lsc, la);
 	}
 
 	LACP_DPRINTF((lp, "aggregator lagid=%s\n",
 	    lacp_format_lagid(&la->la_actor, &la->la_partner,
 	    buf, sizeof(buf))));
 
 	lp->lp_aggregator = la;
 	lp->lp_selected = LACP_SELECTED;
 }
 
 /*
  * lacp_unselect: finish unselect/detach process.
  *
  * Detaches the port from its aggregator, if it has one, and drops the
  * reference the port held on it.  The wait_while timer must not be armed
  * (asserted).
  */
 
 static void
 lacp_unselect(struct lacp_port *lp)
 {
 	struct lacp_aggregator *la;
 
 	KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE),
 	    ("timer_wait_while still active"));
 
 	la = lp->lp_aggregator;
 	if (la != NULL) {
 		/* Clear the back pointer before the aggregator may be freed. */
 		lp->lp_aggregator = NULL;
 		lacp_aggregator_delref(lp->lp_lsc, la);
 	}
 }
 
 /* mux machine */
 
 /*
  * lacp_sm_mux: run the mux state machine for a port until it settles.
  *
  * The partner's SYNC and COLLECTING bits and the port's selection state
  * are sampled once on entry.  The next state is then computed from the
  * current mux state and applied with lacp_set_mux(), repeating until no
  * further transition occurs.  Leaving DISTRIBUTING is logged and counted
  * in sc_flapping.
  */
 static void
 lacp_sm_mux(struct lacp_port *lp)
 {
 	struct lagg_port *lgp = lp->lp_lagg;
 	struct lagg_softc *sc = lgp->lp_softc;
 	enum lacp_mux_state new_state;
 	boolean_t p_sync =
 		    (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0;
 	boolean_t p_collecting =
 	    (lp->lp_partner.lip_state & LACP_STATE_COLLECTING) != 0;
 	enum lacp_selected selected = lp->lp_selected;
 	struct lacp_aggregator *la;
 
 	if (V_lacp_debug > 1)
 		lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, "
 		    "p_sync= 0x%x, p_collecting= 0x%x\n", __func__,
 		    lp->lp_mux_state, selected, p_sync, p_collecting);
 
 re_eval:
 	/* Re-read the aggregator: lacp_set_mux() may have detached it. */
 	la = lp->lp_aggregator;
 	KASSERT(lp->lp_mux_state == LACP_MUX_DETACHED || la != NULL,
 	    ("MUX not detached"));
 	new_state = lp->lp_mux_state;
 	switch (lp->lp_mux_state) {
 	case LACP_MUX_DETACHED:
 		if (selected != LACP_UNSELECTED) {
 			new_state = LACP_MUX_WAITING;
 		}
 		break;
 	case LACP_MUX_WAITING:
 		KASSERT(la->la_pending > 0 ||
 		    !LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE),
 		    ("timer_wait_while still active"));
 		/* Attach only once no port of the aggregator is pending. */
 		if (selected == LACP_SELECTED && la->la_pending == 0) {
 			new_state = LACP_MUX_ATTACHED;
 		} else if (selected == LACP_UNSELECTED) {
 			new_state = LACP_MUX_DETACHED;
 		}
 		break;
 	case LACP_MUX_ATTACHED:
 		if (selected == LACP_SELECTED && p_sync) {
 			new_state = LACP_MUX_COLLECTING;
 		} else if (selected != LACP_SELECTED) {
 			new_state = LACP_MUX_DETACHED;
 		}
 		break;
 	case LACP_MUX_COLLECTING:
 		if (selected == LACP_SELECTED && p_sync && p_collecting) {
 			new_state = LACP_MUX_DISTRIBUTING;
 		} else if (selected != LACP_SELECTED || !p_sync) {
 			new_state = LACP_MUX_ATTACHED;
 		}
 		break;
 	case LACP_MUX_DISTRIBUTING:
 		if (selected != LACP_SELECTED || !p_sync || !p_collecting) {
 			new_state = LACP_MUX_COLLECTING;
 			lacp_dprintf(lp, "Interface stopped DISTRIBUTING, possible flapping\n");
 			sc->sc_flapping++;
 		}
 		break;
 	default:
 		panic("%s: unknown state", __func__);
 	}
 
 	/* Stable: no transition was requested in this pass. */
 	if (lp->lp_mux_state == new_state) {
 		return;
 	}
 
 	lacp_set_mux(lp, new_state);
 	goto re_eval;
 }
 
 /*
  * lacp_set_mux: move a port's mux machine to 'new_state' and perform the
  * actions tied to entering that state.
  *
  * Does nothing when the port is already in 'new_state'.  The aggregator
  * pointer is sampled on entry, so the DETACHED branch can still adjust
  * la_pending before lacp_unselect() releases the aggregator.
  */
 static void
 lacp_set_mux(struct lacp_port *lp, enum lacp_mux_state new_state)
 {
 	struct lacp_aggregator *la = lp->lp_aggregator;
 
 	if (lp->lp_mux_state == new_state) {
 		return;
 	}
 
 	switch (new_state) {
 	case LACP_MUX_DETACHED:
 		lp->lp_state &= ~LACP_STATE_SYNC;
 		lacp_disable_distributing(lp);
 		lacp_disable_collecting(lp);
 		lacp_sm_assert_ntt(lp);
 		/* cancel timer */
 		if (LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE)) {
 			KASSERT(la->la_pending > 0,
 			    ("timer_wait_while not active"));
 			la->la_pending--;
 		}
 		LACP_TIMER_DISARM(lp, LACP_TIMER_WAIT_WHILE);
 		lacp_unselect(lp);
 		break;
 	case LACP_MUX_WAITING:
 		/* Each waiting port is counted in the aggregator's la_pending. */
 		LACP_TIMER_ARM(lp, LACP_TIMER_WAIT_WHILE,
 		    LACP_AGGREGATE_WAIT_TIME);
 		la->la_pending++;
 		break;
 	case LACP_MUX_ATTACHED:
 		lp->lp_state |= LACP_STATE_SYNC;
 		lacp_disable_collecting(lp);
 		lacp_sm_assert_ntt(lp);
 		break;
 	case LACP_MUX_COLLECTING:
 		lacp_enable_collecting(lp);
 		lacp_disable_distributing(lp);
 		lacp_sm_assert_ntt(lp);
 		break;
 	case LACP_MUX_DISTRIBUTING:
 		lacp_enable_distributing(lp);
 		break;
 	default:
 		panic("%s: unknown state", __func__);
 	}
 
 	LACP_DPRINTF((lp, "mux_state %d -> %d\n", lp->lp_mux_state, new_state));
 
 	lp->lp_mux_state = new_state;
 }
 
 /*
  * lacp_sm_mux_timer: account for one pending event on the port's
  * aggregator by decrementing la_pending (which must be positive).
  *
  * NOTE(review): presumably the expiry handler for LACP_TIMER_WAIT_WHILE,
  * matching the la_pending++ done when entering LACP_MUX_WAITING -- confirm
  * against the lacp_timer_funcs table.
  */
 static void
 lacp_sm_mux_timer(struct lacp_port *lp)
 {
 	struct lacp_aggregator *la = lp->lp_aggregator;
 	char buf[LACP_LAGIDSTR_MAX+1];
 
 	KASSERT(la->la_pending > 0, ("no pending event"));
 
 	LACP_DPRINTF((lp, "%s: aggregator %s, pending %d -> %d\n", __func__,
 	    lacp_format_lagid(&la->la_actor, &la->la_partner,
 	    buf, sizeof(buf)),
 	    la->la_pending, la->la_pending - 1));
 
 	la->la_pending--;
 }
 
 /* periodic transmit machine */
 
 /*
  * lacp_sm_ptx_update_timeout: react to a change of the partner's
  * LACP_STATE_TIMEOUT bit.
  *
  * 'oldpstate' is the partner state before the update.  When the TIMEOUT
  * bit is unchanged nothing happens.  Otherwise the periodic timer is
  * disarmed so that it gets re-armed with the new period, and NTT is
  * asserted if the partner now has the TIMEOUT bit set.
  */
 static void
 lacp_sm_ptx_update_timeout(struct lacp_port *lp, uint8_t oldpstate)
 {
 	if (LACP_STATE_EQ(oldpstate, lp->lp_partner.lip_state,
 	    LACP_STATE_TIMEOUT)) {
 		return;
 	}
 
 	LACP_DPRINTF((lp, "partner timeout changed\n"));
 
 	/*
 	 * FAST_PERIODIC -> SLOW_PERIODIC
 	 * or
 	 * SLOW_PERIODIC (-> PERIODIC_TX) -> FAST_PERIODIC
 	 *
 	 * let lacp_sm_ptx_tx_schedule to update timeout.
 	 */
 
 	LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC);
 
 	/*
 	 * if timeout has been shortened, assert NTT.
 	 */
 
 	if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT)) {
 		lacp_sm_assert_ntt(lp);
 	}
 }
 
 /*
  * lacp_sm_ptx_tx_schedule: keep the periodic transmit timer scheduled.
  *
  * When neither side has LACP_STATE_ACTIVITY set the timer is disarmed
  * (NO_PERIODIC).  Otherwise, if the timer is not already armed, it is armed
  * with the fast period when the partner has LACP_STATE_TIMEOUT set and the
  * slow period when it has not.
  */
 static void
 lacp_sm_ptx_tx_schedule(struct lacp_port *lp)
 {
 	int period;
 
 	/* NO_PERIODIC: the ACTIVITY bit is clear on both actor and partner. */
 	if (((lp->lp_state | lp->lp_partner.lip_state) &
 	    LACP_STATE_ACTIVITY) == 0) {
 		LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC);
 		return;
 	}
 
 	/* An armed timer keeps running with its current period. */
 	if (LACP_TIMER_ISARMED(lp, LACP_TIMER_PERIODIC)) {
 		return;
 	}
 
 	if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT) != 0) {
 		period = LACP_FAST_PERIODIC_TIME;
 	} else {
 		period = LACP_SLOW_PERIODIC_TIME;
 	}
 
 	LACP_TIMER_ARM(lp, LACP_TIMER_PERIODIC, period);
 }
 
 /*
  * lacp_sm_ptx_timer: periodic-transmit timer action; flags the port as
  * needing to transmit (NTT).
  */
 static void
 lacp_sm_ptx_timer(struct lacp_port *lp)
 {
 	lacp_sm_assert_ntt(lp);
 }
 
 /*
  * lacp_sm_rx: receive machine; process one received LACPDU for a port.
  *
  * The PDU is ignored when the port does not have LACP_STATE_AGGREGATION
  * set or when the PDU's actor system id equals our own (loopback).
  * Otherwise the receive timestamp is refreshed, selection and NTT are
  * re-evaluated against the PDU, the partner information is recorded, the
  * current_while timer is re-armed, EXPIRED is cleared and the transmit
  * machine is run immediately.
  */
 static void
 lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du)
 {
 	int timeout;
 
 	/*
 	 * check LACP_DISABLED first
 	 */
 
 	if (!(lp->lp_state & LACP_STATE_AGGREGATION)) {
 		return;
 	}
 
 	/*
 	 * check loopback condition.
 	 */
 
 	/* lacp_compare_systemid() returns 0 when the ids are identical. */
 	if (!lacp_compare_systemid(&du->ldu_actor.lip_systemid,
 	    &lp->lp_actor.lip_systemid)) {
 		return;
 	}
 
 	/*
 	 * EXPIRED, DEFAULTED, CURRENT -> CURRENT
 	 */
 
 	microuptime(&lp->lp_last_lacpdu_rx);
 	/* Selection and NTT are evaluated before lp_partner is overwritten. */
 	lacp_sm_rx_update_selected(lp, du);
 	lacp_sm_rx_update_ntt(lp, du);
 	lacp_sm_rx_record_pdu(lp, du);
 
 	/* The timeout length follows our own (actor) TIMEOUT bit. */
 	timeout = (lp->lp_state & LACP_STATE_TIMEOUT) ?
 	    LACP_SHORT_TIMEOUT_TIME : LACP_LONG_TIMEOUT_TIME;
 	LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, timeout);
 
 	lp->lp_state &= ~LACP_STATE_EXPIRED;
 
 	/*
 	 * kick transmit machine without waiting the next tick.
 	 */
 
 	lacp_sm_tx(lp);
 }
 
 /*
  * lacp_sm_rx_set_expired: put the receive machine into the EXPIRED state.
  *
  * Clears SYNC and sets TIMEOUT in the recorded partner state, re-arms the
  * current_while timer with the short timeout and sets LACP_STATE_EXPIRED
  * on the port.
  */
 static void
 lacp_sm_rx_set_expired(struct lacp_port *lp)
 {
 	lp->lp_partner.lip_state &= ~LACP_STATE_SYNC;
 	lp->lp_partner.lip_state |= LACP_STATE_TIMEOUT;
 	LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, LACP_SHORT_TIMEOUT_TIME);
 	lp->lp_state |= LACP_STATE_EXPIRED;
 }
 
 /*
  * lacp_sm_rx_timer: receive-machine timer action.
  *
  * A port that is not yet EXPIRED moves from CURRENT to EXPIRED.  A port
  * that is already EXPIRED moves on to DEFAULTED: selection is re-checked
  * against the default partner, the default partner is recorded and the
  * EXPIRED bit is cleared.
  */
 static void
 lacp_sm_rx_timer(struct lacp_port *lp)
 {
 
 	if ((lp->lp_state & LACP_STATE_EXPIRED) != 0) {
 		/* EXPIRED -> DEFAULTED */
 		LACP_DPRINTF((lp, "%s: EXPIRED -> DEFAULTED\n", __func__));
 		lacp_sm_rx_update_default_selected(lp);
 		lacp_sm_rx_record_default(lp);
 		lp->lp_state &= ~LACP_STATE_EXPIRED;
 		return;
 	}
 
 	/* CURRENT -> EXPIRED */
 	LACP_DPRINTF((lp, "%s: CURRENT -> EXPIRED\n", __func__));
 	lacp_sm_rx_set_expired(lp);
 }
 
 /*
  * lacp_sm_rx_record_pdu: record the partner information carried by a
  * received LACPDU.
  *
  * The PDU's actor information becomes the port's partner information.
  * The partner SYNC bit is then adjusted: it is cleared unless the link is
  * active and the PDU's view of us either matches our actor information
  * (including the AGGREGATION bit) or has AGGREGATION clear; when that
  * condition holds, SYNC is forced on in strict mode and left as
  * advertised otherwise.  DEFAULTED is cleared and the periodic timeout is
  * updated from the old partner state.
  */
 static void
 lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du)
 {
 	boolean_t active;
 	uint8_t oldpstate;
 	char buf[LACP_STATESTR_MAX+1];
 
 	LACP_TRACE(lp);
 
 	oldpstate = lp->lp_partner.lip_state;
 
 	/* Active if the sender is active, or we are and the sender saw it. */
 	active = (du->ldu_actor.lip_state & LACP_STATE_ACTIVITY)
 	    || ((lp->lp_state & LACP_STATE_ACTIVITY) &&
 	    (du->ldu_partner.lip_state & LACP_STATE_ACTIVITY));
 
 	lp->lp_partner = du->ldu_actor;
 	if (active &&
 	    ((LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state,
 	    LACP_STATE_AGGREGATION) &&
 	    !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner))
 	    || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) {
 		/*
 		 * XXX Maintain legacy behavior of leaving the
 		 * LACP_STATE_SYNC bit unchanged from the partner's
 		 * advertisement if lsc_strict_mode is false.
 		 * TODO: We should re-examine the concept of the "strict mode"
 		 * to ensure it makes sense to maintain a non-strict mode.
 		 */
 		if (lp->lp_lsc->lsc_strict_mode)
 			lp->lp_partner.lip_state |= LACP_STATE_SYNC;
 	} else {
 		lp->lp_partner.lip_state &= ~LACP_STATE_SYNC;
 	}
 
 	lp->lp_state &= ~LACP_STATE_DEFAULTED;
 
 	if (oldpstate != lp->lp_partner.lip_state) {
 		LACP_DPRINTF((lp, "old pstate %s\n",
 		    lacp_format_state(oldpstate, buf, sizeof(buf))));
 		LACP_DPRINTF((lp, "new pstate %s\n",
 		    lacp_format_state(lp->lp_partner.lip_state, buf,
 		    sizeof(buf))));
 	}
 
 	lacp_sm_ptx_update_timeout(lp, oldpstate);
 }
 
 /*
  * lacp_sm_rx_update_ntt: assert NTT when the received LACPDU's view of us
  * is out of date.
  *
  * NTT is asserted if the PDU's partner information differs from our actor
  * information, or if any of the ACTIVITY, SYNC or AGGREGATION bits differ
  * between our state and the PDU's partner state.
  */
 static void
 lacp_sm_rx_update_ntt(struct lacp_port *lp, const struct lacpdu *du)
 {
 
 	LACP_TRACE(lp);
 
 	if (lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner) ||
 	    !LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state,
 	    LACP_STATE_ACTIVITY | LACP_STATE_SYNC | LACP_STATE_AGGREGATION)) {
 		LACP_DPRINTF((lp, "%s: assert ntt\n", __func__));
 		lacp_sm_assert_ntt(lp);
 	}
 }
 
 static void
 lacp_sm_rx_record_default(struct lacp_port *lp)
 {
 	uint8_t oldpstate;
 
 	LACP_TRACE(lp);
 
 	oldpstate = lp->lp_partner.lip_state;
 	if (lp->lp_lsc->lsc_strict_mode)
 		lp->lp_partner = lacp_partner_admin_strict;
 	else
 		lp->lp_partner = lacp_partner_admin_optimistic;
 	lp->lp_state |= LACP_STATE_DEFAULTED;
 	lacp_sm_ptx_update_timeout(lp, oldpstate);
 }
 
 /*
  * lacp_sm_rx_update_selected_from_peerinfo: unselect the port when 'info'
  * no longer matches the recorded partner.
  *
  * The port becomes LACP_UNSELECTED if the peerinfo differs from
  * lp_partner or if their AGGREGATION bits differ.  The aggregator pointer
  * is deliberately left in place here.
  */
 static void
 lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *lp,
     const struct lacp_peerinfo *info)
 {
 
 	LACP_TRACE(lp);
 
 	if (lacp_compare_peerinfo(&lp->lp_partner, info) ||
 	    !LACP_STATE_EQ(lp->lp_partner.lip_state, info->lip_state,
 	    LACP_STATE_AGGREGATION)) {
 		lp->lp_selected = LACP_UNSELECTED;
 		/* mux machine will clean up lp->lp_aggregator */
 	}
 }
 
 /*
  * lacp_sm_rx_update_selected: re-check the port's selection against the
  * actor information of a received LACPDU.
  */
 static void
 lacp_sm_rx_update_selected(struct lacp_port *lp, const struct lacpdu *du)
 {
 
 	LACP_TRACE(lp);
 
 	lacp_sm_rx_update_selected_from_peerinfo(lp, &du->ldu_actor);
 }
 
 /*
  * lacp_sm_rx_update_default_selected: re-check the port's selection
  * against the administrative default partner, using the strict or the
  * optimistic default according to the softc's strict-mode setting.
  */
 static void
 lacp_sm_rx_update_default_selected(struct lacp_port *lp)
 {
 	const struct lacp_peerinfo *admin;
 
 	LACP_TRACE(lp);
 
 	if (lp->lp_lsc->lsc_strict_mode) {
 		admin = &lacp_partner_admin_strict;
 	} else {
 		admin = &lacp_partner_admin_optimistic;
 	}
 
 	lacp_sm_rx_update_selected_from_peerinfo(lp, admin);
 }
 
 /* transmit machine */
 
 /*
  * Transmit machine: send a LACPDU if the port has NTT ("need to
  * transmit") pending.
  *
  * NTT is dropped without sending when the port's AGGREGATION bit is
  * clear, or when neither our state nor the partner's state has the
  * ACTIVITY bit set.  Transmission is rate limited through
  * ppsratecheck(); a rate-limited call leaves NTT set for a later call.
  * NTT is cleared only when the transmit attempt reports no error.
  */
 static void
 lacp_sm_tx(struct lacp_port *lp)
 {
 	int error = 0;
 
 	if (!(lp->lp_state & LACP_STATE_AGGREGATION)
 #if 1
 	    || (!(lp->lp_state & LACP_STATE_ACTIVITY)
 	    && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY))
 #endif
 	    ) {
 		lp->lp_flags &= ~LACP_PORT_NTT;
 	}
 
 	if (!(lp->lp_flags & LACP_PORT_NTT)) {
 		return;
 	}
 
 	/* Rate limit to 3 PDUs per LACP_FAST_PERIODIC_TIME */
 	if (ppsratecheck(&lp->lp_last_lacpdu, &lp->lp_lacpdu_sent,
 		    (3 / LACP_FAST_PERIODIC_TIME)) == 0) {
 		LACP_DPRINTF((lp, "rate limited pdu\n"));
 		return;
 	}
 
 	/*
 	 * lsc_tx_test is a per-unit bitmask; when this interface's bit is
 	 * set the PDU is dropped on purpose.  error stays 0 in that case,
 	 * so NTT is still cleared below as if the PDU had been sent.
 	 */
 	if (((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_tx_test) == 0) {
 		error = lacp_xmit_lacpdu(lp);
 	} else {
 		LACP_TPRINTF((lp, "Dropping TX PDU\n"));
 	}
 
 	if (error == 0) {
 		lp->lp_flags &= ~LACP_PORT_NTT;
 	} else {
 		/* Keep NTT set so a later call retries the transmission. */
 		LACP_DPRINTF((lp, "lacpdu transmit failure, error %d\n",
 		    error));
 	}
 }
 
 /*
  * Flag the port as needing to transmit a LACPDU.  The flag is consumed
  * (and cleared) by lacp_sm_tx().
  */
 static void
 lacp_sm_assert_ntt(struct lacp_port *lp)
 {
 
 	lp->lp_flags |= LACP_PORT_NTT;
 }
 
 static void
 lacp_run_timers(struct lacp_port *lp)
 {
 	int i;
 	struct timeval time_diff;
 
 	for (i = 0; i < LACP_NTIMER; i++) {
 		KASSERT(lp->lp_timer[i] >= 0,
 		    ("invalid timer value %d", lp->lp_timer[i]));
 		if (lp->lp_timer[i] == 0) {
 			continue;
 		} else {
 			if (i == LACP_TIMER_CURRENT_WHILE) {
 				microuptime(&time_diff);
 				timevalsub(&time_diff, &lp->lp_last_lacpdu_rx);
 				if (time_diff.tv_sec) {
 					/* At least one sec has elapsed since last LACP packet. */
 					--lp->lp_timer[i];
 				}
 			} else {
 				--lp->lp_timer[i];
 			}
 
 			if ((lp->lp_timer[i] <= 0) && (lacp_timer_funcs[i])) {
 				(*lacp_timer_funcs[i])(lp);
 			}
 		}
 	}
 }
 
 /*
  * Process a received marker PDU.
  *
  * The mbuf is always consumed: it is either re-queued for transmission
  * (marker information is answered with a marker response), or freed.
  *
  * Returns 0 on success, ENOMEM if m_pullup() fails, EINVAL for a
  * malformed or unexpected frame, or the error from lagg_enqueue() when
  * sending a response.
  */
 int
 lacp_marker_input(struct lacp_port *lp, struct mbuf *m)
 {
 	struct lacp_softc *lsc = lp->lp_lsc;
 	struct lagg_port *lgp = lp->lp_lagg;
 	struct lacp_port *lp2;
 	struct markerdu *mdu;
 	int error = 0;
 	int pending = 0;
 
 	/* The frame must be exactly one marker PDU long... */
 	if (m->m_pkthdr.len != sizeof(*mdu)) {
 		goto bad;
 	}
 
 	/* ...and must have been received as multicast. */
 	if ((m->m_flags & M_MCAST) == 0) {
 		goto bad;
 	}
 
 	/* Make the whole PDU contiguous before casting to the struct. */
 	if (m->m_len < sizeof(*mdu)) {
 		m = m_pullup(m, sizeof(*mdu));
 		if (m == NULL) {
 			return (ENOMEM);
 		}
 	}
 
 	mdu = mtod(m, struct markerdu *);
 
 	if (memcmp(&mdu->mdu_eh.ether_dhost,
 	    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) {
 		goto bad;
 	}
 
 	if (mdu->mdu_sph.sph_version != 1) {
 		goto bad;
 	}
 
 	switch (mdu->mdu_tlv.tlv_type) {
 	case MARKER_TYPE_INFO:
 		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
 		    marker_info_tlv_template, TRUE)) {
 			goto bad;
 		}
 		/*
 		 * Turn the request into a response in place and send it
 		 * back out of the receiving port; the mbuf is handed to
 		 * lagg_enqueue().
 		 */
 		mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE;
 		memcpy(&mdu->mdu_eh.ether_dhost,
 		    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN);
 		memcpy(&mdu->mdu_eh.ether_shost,
 		    lgp->lp_lladdr, ETHER_ADDR_LEN);
 		error = lagg_enqueue(lp->lp_ifp, m);
 		break;
 
 	case MARKER_TYPE_RESPONSE:
 		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
 		    marker_response_tlv_template, TRUE)) {
 			goto bad;
 		}
 		LACP_DPRINTF((lp, "marker response, port=%u, sys=%6D, id=%u\n",
 		    ntohs(mdu->mdu_info.mi_rq_port), mdu->mdu_info.mi_rq_system,
 		    ":", ntohl(mdu->mdu_info.mi_rq_xid)));
 
 		/* Verify that it is the last marker we sent out */
 		if (memcmp(&mdu->mdu_info, &lp->lp_marker,
 		    sizeof(struct lacp_markerinfo)))
 			goto bad;
 
 		LACP_LOCK(lsc);
 		/* This port's outstanding marker has been answered. */
 		lp->lp_flags &= ~LACP_PORT_MARK;
 
 		if (lsc->lsc_suppress_distributing) {
 			/* Check if any ports are waiting for a response */
 			LIST_FOREACH(lp2, &lsc->lsc_ports, lp_next) {
 				if (lp2->lp_flags & LACP_PORT_MARK) {
 					pending = 1;
 					break;
 				}
 			}
 
 			if (pending == 0) {
 				/* All interface queues are clear */
 				LACP_DPRINTF((NULL, "queue flush complete\n"));
 				lsc->lsc_suppress_distributing = FALSE;
 			}
 		}
 		LACP_UNLOCK(lsc);
 		m_freem(m);
 		break;
 
 	default:
 		goto bad;
 	}
 
 	return (error);
 
 bad:
 	/* Common exit for rejected frames: log, free the mbuf, fail. */
 	LACP_DPRINTF((lp, "bad marker frame\n"));
 	m_freem(m);
 	return (EINVAL);
 }
 
 /*
  * Validate a chain of TLVs against a template.
  *
  * p/size describe the enclosing buffer, tlv points at the first TLV
  * inside it and tmpl at the first template entry.  Each TLV header must
  * lie within the buffer and its length must equal the template's; its
  * type is compared as well when check_type is set.  The walk ends
  * successfully at the template entry whose type is 0.
  *
  * Returns 0 when the chain matches the template, EINVAL otherwise.
  */
 static int
 tlv_check(const void *p, size_t size, const struct tlvhdr *tlv,
     const struct tlv_template *tmpl, boolean_t check_type)
 {
 	const char *base = p;
 	const char *cur;
 
 	for (;; tmpl++) {
 		cur = (const char *)tlv;
 
 		/* The TLV header itself must fit inside the buffer. */
 		if (cur - base + sizeof(*tlv) > size)
 			return (EINVAL);
 		if (check_type && tlv->tlv_type != tmpl->tmpl_type)
 			return (EINVAL);
 		if (tlv->tlv_length != tmpl->tmpl_length)
 			return (EINVAL);
 
 		/* Terminator entry reached: everything matched. */
 		if (tmpl->tmpl_type == 0)
 			return (0);
 
 		tlv = (const struct tlvhdr *)(cur + tlv->tlv_length);
 	}
 }
 
 /* Debugging */
 /*
  * Format a 6-byte MAC address as upper-case, dash-separated hex
  * ("XX-XX-XX-XX-XX-XX") into buf, truncating to buflen.  Returns buf.
  */
 const char *
 lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen)
 {
 	snprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X",
 	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
 
 	return (buf);
 }
 
 /*
  * Format a system identifier as "PRIO,MAC" into buf, where PRIO is the
  * priority as four hex digits in host byte order and MAC is produced by
  * lacp_format_mac().  Returns buf.
  */
 const char *
 lacp_format_systemid(const struct lacp_systemid *sysid,
     char *buf, size_t buflen)
 {
 	char macstr[LACP_MACSTR_MAX+1];
 	const char *mac;
 
 	mac = lacp_format_mac(sysid->lsi_mac, macstr, sizeof(macstr));
 	snprintf(buf, buflen, "%04X,%s", ntohs(sysid->lsi_prio), mac);
 
 	return (buf);
 }
 
 /*
  * Format a port identifier as "PRIO,PORTNO" into buf, each field printed
  * as four hex digits after conversion to host byte order.  Returns buf.
  */
 const char *
 lacp_format_portid(const struct lacp_portid *portid, char *buf, size_t buflen)
 {
 	const unsigned int prio = ntohs(portid->lpi_prio);
 	const unsigned int portno = ntohs(portid->lpi_portno);
 
 	snprintf(buf, buflen, "%04X,%04X", prio, portno);
 
 	return (buf);
 }
 
 /*
  * Format peer information as "(SYSTEMID,KEY,PORTID)" into buf, with KEY
  * printed as four hex digits in host byte order.  Returns buf.
  */
 const char *
 lacp_format_partner(const struct lacp_peerinfo *peer, char *buf, size_t buflen)
 {
 	char sysbuf[LACP_SYSTEMIDSTR_MAX+1];
 	char portbuf[LACP_PORTIDSTR_MAX+1];
 	const char *sys, *port;
 
 	sys = lacp_format_systemid(&peer->lip_systemid, sysbuf,
 	    sizeof(sysbuf));
 	port = lacp_format_portid(&peer->lip_portid, portbuf,
 	    sizeof(portbuf));
 	snprintf(buf, buflen, "(%s,%04X,%s)", sys, ntohs(peer->lip_key), port);
 
 	return (buf);
 }
 
 /*
  * Format a LAG identifier as "[A,B]" into buf, where A and B are the two
  * peers rendered by lacp_format_partner().  The peers are printed in the
  * order given; the code that would sort them is compiled out.
  * Returns buf.
  */
 const char *
 lacp_format_lagid(const struct lacp_peerinfo *a,
     const struct lacp_peerinfo *b, char *buf, size_t buflen)
 {
 	char astr[LACP_PARTNERSTR_MAX+1];
 	char bstr[LACP_PARTNERSTR_MAX+1];
 
 #if 0
 	/*
 	 * there's a convention to display small numbered peer
 	 * in the left.
 	 */
 
 	if (lacp_compare_peerinfo(a, b) > 0) {
 		const struct lacp_peerinfo *t;
 
 		t = a;
 		a = b;
 		b = t;
 	}
 #endif
 
 	snprintf(buf, buflen, "[%s,%s]",
 	    lacp_format_partner(a, astr, sizeof(astr)),
 	    lacp_format_partner(b, bstr, sizeof(bstr)));
 
 	return (buf);
 }
 
 /*
  * Format the LAG identifier of an aggregator (its actor/partner pair).
  * A NULL aggregator yields the constant string "(none)" and leaves buf
  * untouched; otherwise the result is written to buf and buf is returned.
  */
 const char *
 lacp_format_lagid_aggregator(const struct lacp_aggregator *la,
     char *buf, size_t buflen)
 {
 
 	return (la == NULL ? "(none)" :
 	    lacp_format_lagid(&la->la_actor, &la->la_partner, buf, buflen));
 }
 
 /*
  * Render a LACP state byte into buf using the "%b" conversion with the
  * LACP_STATE_BITS description string, truncating to buflen.
  * Returns buf.
  */
 const char *
 lacp_format_state(uint8_t state, char *buf, size_t buflen)
 {
 	snprintf(buf, buflen, "%b", state, LACP_STATE_BITS);
 	return (buf);
 }
 
 /*
  * Debug helper: print the actor and partner information (identity and
  * state) and the collector max delay of a LACPDU, one item per line.
  */
 static void
 lacp_dump_lacpdu(const struct lacpdu *du)
 {
 	/* buf and buf2 are reused for the actor and then the partner. */
 	char buf[LACP_PARTNERSTR_MAX+1];
 	char buf2[LACP_STATESTR_MAX+1];
 
 	printf("actor=%s\n",
 	    lacp_format_partner(&du->ldu_actor, buf, sizeof(buf)));
 	printf("actor.state=%s\n",
 	    lacp_format_state(du->ldu_actor.lip_state, buf2, sizeof(buf2)));
 	printf("partner=%s\n",
 	    lacp_format_partner(&du->ldu_partner, buf, sizeof(buf)));
 	printf("partner.state=%s\n",
 	    lacp_format_state(du->ldu_partner.lip_state, buf2, sizeof(buf2)));
 
 	printf("maxdelay=%d\n", ntohs(du->ldu_collector.lci_maxdelay));
 }
 
 /*
  * printf-style debug output.  When a port is supplied the message is
  * prefixed with its interface name; lp may be NULL for messages that are
  * not tied to a port.
  */
 static void
 lacp_dprintf(const struct lacp_port *lp, const char *fmt, ...)
 {
 	va_list ap;
 
 	if (lp != NULL)
 		printf("%s: ", lp->lp_ifp->if_xname);
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 }
diff --git a/sys/net/if.c b/sys/net/if.c
index c7a6cf18d4b7..a18e4f1d8f56 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -1,4882 +1,4883 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2010 Bjoern A. Zeeb <bz@FreeBSD.org>
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_bpf.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/domainset.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/epoch.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/nv.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_mib.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <net/debugnet.h>
 #include <netinet/if_ether.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name
  * and ifr_ifru when it is used in SIOCGIFCONF.
  */
 _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
     offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");
 
 __read_mostly epoch_t net_epoch_preempt;
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct ifreq_buffer32 {
 	uint32_t	length;		/* (size_t) */
 	uint32_t	buffer;		/* (void *) */
 };
 
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
  * remainder may be interface specific.
  */
 struct ifreq32 {
 	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	union {
 		struct sockaddr	ifru_addr;
 		struct sockaddr	ifru_dstaddr;
 		struct sockaddr	ifru_broadaddr;
 		struct ifreq_buffer32 ifru_buffer;
 		short		ifru_flags[2];
 		short		ifru_index;
 		int		ifru_jid;
 		int		ifru_metric;
 		int		ifru_mtu;
 		int		ifru_phys;
 		int		ifru_media;
 		uint32_t	ifru_data;
 		int		ifru_cap[2];
 		u_int		ifru_fib;
 		u_char		ifru_vlan_pcp;
 	} ifr_ifru;
 };
 CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32));
 CTASSERT(__offsetof(struct ifreq, ifr_ifru) ==
     __offsetof(struct ifreq32, ifr_ifru));
 
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 
 struct ifdrv32 {
 	char		ifd_name[IFNAMSIZ];
 	uint32_t	ifd_cmd;
 	uint32_t	ifd_len;
 	uint32_t	ifd_data;
 };
 #define SIOCSDRVSPEC32	_IOC_NEWTYPE(SIOCSDRVSPEC, struct ifdrv32)
 #define SIOCGDRVSPEC32	_IOC_NEWTYPE(SIOCGDRVSPEC, struct ifdrv32)
 
 struct ifgroupreq32 {
 	char	ifgr_name[IFNAMSIZ];
 	u_int	ifgr_len;
 	union {
 		char		ifgru_group[IFNAMSIZ];
 		uint32_t	ifgru_groups;
 	} ifgr_ifgru;
 };
 #define	SIOCAIFGROUP32	_IOC_NEWTYPE(SIOCAIFGROUP, struct ifgroupreq32)
 #define	SIOCGIFGROUP32	_IOC_NEWTYPE(SIOCGIFGROUP, struct ifgroupreq32)
 #define	SIOCDIFGROUP32	_IOC_NEWTYPE(SIOCDIFGROUP, struct ifgroupreq32)
 #define	SIOCGIFGMEMB32	_IOC_NEWTYPE(SIOCGIFGMEMB, struct ifgroupreq32)
 
 struct ifmediareq32 {
 	char		ifm_name[IFNAMSIZ];
 	int		ifm_current;
 	int		ifm_mask;
 	int		ifm_status;
 	int		ifm_active;
 	int		ifm_count;
 	uint32_t	ifm_ulist;	/* (int *) */
 };
 #define	SIOCGIFMEDIA32	_IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32)
 #define	SIOCGIFXMEDIA32	_IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32)
 #endif /* COMPAT_FREEBSD32 */
 
 union ifreq_union {
 	struct ifreq	ifr;
 #ifdef COMPAT_FREEBSD32
 	struct ifreq32	ifr32;
 #endif
 };
 
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Log promiscuous mode change events */
 static int log_promisc_mode_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN,
 	&log_promisc_mode_change, 1,
 	"log promiscuous mode change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *, bool);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	if_input_default(struct ifnet *, struct mbuf *);
 static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit_default(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, bool);
 static int	if_detach_internal(struct ifnet *, bool);
 static void	if_siocaddmulti(void *, int);
 static void	if_link_ifnet(struct ifnet *);
 static bool	if_unlink_ifnet(struct ifnet *, bool);
 #ifdef VIMAGE
 static int	if_vmove(struct ifnet *, struct vnet *);
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /* ipsec helper hooks */
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 /*
  * Table of ifnet by index.
  *
  * if_index is the highest slot currently in use: it is raised when a new
  * slot above it is taken and lowered when trailing slots become empty.
  * if_indexlim is the number of entries allocated for ifindex_table.
  */
 static int if_index;
 static int if_indexlim = 8;
 static struct ifindex_entry {
 	struct ifnet	*ife_ifnet;	/* interface in this slot, or NULL */
 	uint16_t	ife_gencnt;	/* bumped each time the slot is freed */
 } *ifindex_table;
 
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Variables global to all interfaces");
 /*
  * Sysctl handler reporting the highest interface index that is in use
  * by an interface belonging to the current vnet, or 0 if there is none.
  */
 static int
 sysctl_ifcount(SYSCTL_HANDLER_ARGS)
 {
 	int highest = 0;
 
 	IFNET_RLOCK();
 	/* Scan from the top so the first hit is the maximum. */
 	for (int i = if_index; i >= 1; i--) {
 		if (ifindex_table[i].ife_ifnet != NULL &&
 		    ifindex_table[i].ife_ifnet->if_vnet == curvnet) {
 			highest = i;
 			break;
 		}
 	}
 	IFNET_RUNLOCK();
 
 	return (sysctl_handle_int(oidp, &highest, 0, req));
 }
 SYSCTL_PROC(_net_link_generic_system, IFMIB_IFCOUNT, ifcount,
     CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RD, NULL, 0, sysctl_ifcount, "I",
     "Maximum known interface index");
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock.
  * This may be acquired to stabilise the list, or we may rely on NET_EPOCH.
  */
 struct sx ifnet_sxlock;
 SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);
 
 struct sx ifnet_detach_sxlock;
 SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx",
     SX_RECURSE);
 
 #ifdef VIMAGE
 #define	VNET_IS_SHUTTING_DOWN(_vnet)					\
     ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE)
 #endif
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 /*
  * Look up an interface by index.  Must be called inside the network
  * epoch.  Returns NULL when the index is beyond the highest one in use,
  * when the slot is empty, or when a current vnet is set and the
  * interface belongs to a different vnet.  No reference is taken.
  */
 struct ifnet *
 ifnet_byindex(u_int idx)
 {
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 
 	if (__predict_false(idx > if_index))
 		return (NULL);
 
 	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
 
 	/* Hide interfaces that live in another vnet. */
 	if (curvnet != NULL && ifp != NULL && ifp->if_vnet != curvnet)
 		return (NULL);
 
 	return (ifp);
 }
 
 /*
  * Look up an interface by index and acquire a reference on it.
  * Returns NULL when there is no such interface, when it is flagged
  * IFF_DYING, or when a reference can no longer be obtained.
  */
 struct ifnet *
 ifnet_byindex_ref(u_int idx)
 {
 	struct ifnet *ifp = ifnet_byindex(idx);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING) != 0 ||
 	    !if_try_ref(ifp))
 		return (NULL);
 
 	return (ifp);
 }
 
 /*
  * Look up an interface by index and generation count.  Must be called
  * inside the network epoch.  The slot's content is returned only when
  * the slot's generation count equals gen; otherwise, or when idx is
  * beyond the highest index in use, NULL is returned.
  */
 struct ifnet *
 ifnet_byindexgen(uint16_t idx, uint16_t gen)
 {
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 
 	if (__predict_false(idx > if_index))
 		return (NULL);
 
 	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
 
 	return (ifindex_table[idx].ife_gencnt == gen ? ifp : NULL);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 /*
  * Allocate the initial, zero-filled interface index table with
  * if_indexlim entries.  Run once via the SYSINIT that follows.
  */
 static void
 if_init_idxtable(void *arg __unused)
 {
 
 	ifindex_table = malloc(if_indexlim * sizeof(*ifindex_table),
 	    M_IFNET, M_WAITOK | M_ZERO);
 }
 SYSINIT(if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init_idxtable, NULL);
 
 /*
  * Per-vnet initialization: start with empty interface and interface
  * group lists and initialize the interface cloner state.
  */
 static void
 vnet_if_init(const void *unused __unused)
 {
 
 	CK_STAILQ_INIT(&V_ifnet);
 	CK_STAILQ_INIT(&V_ifg_head);
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 /*
  * Append ifp to the current vnet's interface list under the ifnet write
  * lock.  With VIMAGE the vnet's interface count is bumped as well.
  */
 static void
 if_link_ifnet(struct ifnet *ifp)
 {
 
 	IFNET_WLOCK();
 	CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 }
 
 static bool
 if_unlink_ifnet(struct ifnet *ifp, bool vmove)
 {
 	struct ifnet *iter;
 	int found = 0;
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link);
 			if (!vmove)
 				ifp->if_flags |= IFF_DYING;
 			found = 1;
 			break;
 		}
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt--;
 #endif
 	IFNET_WUNLOCK();
 
 	return (found);
 }
 
 #ifdef VIMAGE
 /*
  * Vnet teardown hook: hand every interface that does not belong to this
  * vnet (if_home_vnet differs from if_vnet) back to its home vnet.
  *
  * This is done in two phases: first the interfaces are unlinked and
  * collected in a temporary array while the ifnet lock is held, then,
  * with that lock dropped, each one is moved with if_vmove() under
  * ifnet_detach_sxlock.
  */
 static void
 vnet_if_return(const void *unused __unused)
 {
 	struct ifnet *ifp, *nifp;
 	struct ifnet **pending;
 	int found __diagused;
 	int i;
 
 	i = 0;
 
 	/*
 	 * We need to protect our access to the V_ifnet tailq. Ordinarily we'd
 	 * enter NET_EPOCH, but that's not possible, because if_vmove() calls
 	 * if_detach_internal(), which waits for NET_EPOCH callbacks to
 	 * complete. We can't do that from within NET_EPOCH.
 	 *
 	 * However, we can also use the IFNET_xLOCK, which is the V_ifnet
 	 * read/write lock. We cannot hold the lock as we call if_vmove()
 	 * though, as that presents LOR w.r.t ifnet_sx, in_multi_sx and iflib
 	 * ctx lock.
 	 */
 	IFNET_WLOCK();
 
 	/* One slot per interface currently counted in this vnet. */
 	pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt,
 	    M_IFNET, M_WAITOK | M_ZERO);
 
 	/* Return all inherited interfaces to their parent vnets. */
 	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
 		if (ifp->if_home_vnet != ifp->if_vnet) {
 			found = if_unlink_ifnet(ifp, true);
 			MPASS(found);
 
 			pending[i++] = ifp;
 		}
 	}
 	IFNET_WUNLOCK();
 
 	/* Second phase: perform the moves without the ifnet lock held. */
 	for (int j = 0; j < i; j++) {
 		sx_xlock(&ifnet_detach_sxlock);
 		if_vmove(pending[j], pending[j]->if_home_vnet);
 		sx_xunlock(&ifnet_detach_sxlock);
 	}
 
 	free(pending, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
     vnet_if_return, NULL);
 #endif
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  *
  * numa_domain selects the preferred memory domain for the allocation;
  * IF_NODOM requests a plain allocation.  All allocations use M_WAITOK.
  * The returned ifnet is zero-filled apart from the fields set up here,
  * carries one reference (the index reference) and is already published
  * in ifindex_table under its new index.
  */
 static struct ifnet *
 if_alloc_domain(u_char type, int numa_domain)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
 	if (numa_domain == IF_NODOM)
 		ifp = malloc(sizeof(struct ifnet), M_IFNET,
 		    M_WAITOK | M_ZERO);
 	else
 		ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET,
 		    DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO);
 	ifp->if_type = type;
 	/* Remembered separately; if_free_deferred() keys on if_alloctype. */
 	ifp->if_alloctype = type;
 	ifp->if_numa_domain = numa_domain;
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 #endif
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		KASSERT(ifp->if_l2com, ("%s: if_com_alloc[%u] failed", __func__,
 		    type));
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	CK_STAILQ_INIT(&ifp->if_addrhead);
 	CK_STAILQ_INIT(&ifp->if_multiaddrs);
 	CK_STAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	for (int i = 0; i < IFCOUNTERS; i++)
 		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
 	ifp->if_get_counter = if_get_counter_default;
 	ifp->if_pcp = IFNET_PCP_NONE;
 
 	/* Allocate an ifindex array entry. */
 	IFNET_WLOCK();
 	/*
 	 * Try to find an empty slot below if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= if_index; idx++) {
 		if (ifindex_table[idx].ife_ifnet == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= if_indexlim) {
 		struct ifindex_entry *new, *old;
 		int newlim;
 
 		/*
 		 * Double the table: copy the old entries, publish the new
 		 * table, then wait for the network epoch before freeing
 		 * the old one.
 		 */
 		newlim = if_indexlim * 2;
 		new = malloc(newlim * sizeof(*new), M_IFNET, M_WAITOK | M_ZERO);
 		memcpy(new, ifindex_table, if_indexlim * sizeof(*new));
 		old = ifindex_table;
 		ck_pr_store_ptr(&ifindex_table, new);
 		if_indexlim = newlim;
 		epoch_wait_preempt(net_epoch_preempt);
 		free(old, M_IFNET);
 	}
 	if (idx > if_index)
 		if_index = idx;
 
 	ifp->if_index = idx;
 	/* Record the slot's current generation alongside the index. */
 	ifp->if_idxgen = ifindex_table[idx].ife_gencnt;
 	ck_pr_store_ptr(&ifindex_table[idx].ife_ifnet, ifp);
 	IFNET_WUNLOCK();
 
 	return (ifp);
 }
 
 /*
  * Allocate an ifnet for a device, preferring memory from the device's
  * NUMA domain.  When dev is NULL or its domain cannot be determined the
  * allocation is made without a domain preference.
  */
 struct ifnet *
 if_alloc_dev(u_char type, device_t dev)
 {
 	int numa_domain;
 
 	if (dev != NULL && bus_get_domain(dev, &numa_domain) == 0)
 		return (if_alloc_domain(type, numa_domain));
 
 	return (if_alloc_domain(type, IF_NODOM));
 }
 
 /*
  * Allocate an ifnet of the given type without any NUMA domain
  * preference.
  */
 struct ifnet *
 if_alloc(u_char type)
 {
 
 	return (if_alloc_domain(type, IF_NODOM));
 }
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the network epoch guarantees
  * us that nobody holds a pointer to the interface.
  *
  * ctx is the if_epoch_ctx member embedded in the ifnet being freed.
  */
 static void
 if_free_deferred(epoch_context_t ctx)
 {
 	struct ifnet *ifp = __containerof(ctx, struct ifnet, if_epoch_ctx);
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("%s: interface not dying", __func__));
 
 	/* Release the layer 2 structure using the type it was allocated as. */
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 
 	for (int i = 0; i < IFCOUNTERS; i++)
 		counter_u64_free(ifp->if_counters[i]);
 
 	if_freedescr(ifp->if_description);
 	free(ifp->if_hw_addr, M_IFADDR);
 	free(ifp, M_IFNET);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  *
  * The interface is flagged IFF_DYING, its slot in ifindex_table is
  * cleared and the slot's generation count is bumped.  The reference is
  * then dropped; if it was the last one, the actual free is scheduled
  * through the network epoch, otherwise it happens when the remaining
  * references are released.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	/*
 	 * XXXGL: An interface index is really an alias to ifp pointer.
 	 * Why would we clear the alias now, and not in the deferred
 	 * context?  Indeed there is nothing wrong with some network
 	 * thread obtaining ifp via ifnet_byindex() inside the network
 	 * epoch and then dereferencing ifp while we perform if_free(),
 	 * and after if_free() finished, too.
 	 *
 	 * This early index freeing was important back when ifindex was
 	 * virtualized and interface would outlive the vnet.
 	 */
 	IFNET_WLOCK();
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 	ck_pr_store_ptr(&ifindex_table[ifp->if_index].ife_ifnet, NULL);
 	ifindex_table[ifp->if_index].ife_gencnt++;
 	/* Shrink if_index past any now-empty slots at the top. */
 	while (if_index > 0 && ifindex_table[if_index].ife_ifnet == NULL)
 		if_index--;
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx);
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 /*
  * Acquire an additional reference on ifp.  The caller must already hold
  * a reference: with assertions enabled, a prior count of zero panics.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 	u_int old __diagused;
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	old = refcount_acquire(&ifp->if_refcount);
 	KASSERT(old > 0, ("%s: ifp %p has 0 refs", __func__, ifp));
 }
 
 /*
  * Try to acquire a reference on ifp from within the network epoch.
  * Returns false, taking no reference, when the count is already zero.
  */
 bool
 if_try_ref(struct ifnet *ifp)
 {
 	NET_EPOCH_ASSERT();
 	return (refcount_acquire_if_not_zero(&ifp->if_refcount));
 }
 
 /*
  * Drop a reference on ifp.  When the last reference goes away the ifnet
  * is freed through a network epoch callback.
  */
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (refcount_release(&ifp->if_refcount))
 		NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx);
 }
 
 /*
  * Initialize an interface send queue: set up its mutex (named after the
  * interface), apply the global default length if none was configured,
  * and reset the ALTQ state, keeping only the ALTQF_CANTCHANGE flag bits.
  */
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	/* A length of zero means no limit was set; use the global one. */
 	if (ifq->ifq_maxlen == 0)
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_ifp = ifp;
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_tbr = NULL;
 	ifq->altq_flags = ifq->altq_flags & ALTQF_CANTCHANGE;
 }
 
 /*
  * Tear down an interface send queue by destroying the mutex that
  * ifq_init() created.
  */
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initialization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * Note that if_detach_internal() removes group membership unconditionally
  * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
  * Thus, when if_vmove() is applied to a cloned interface, group membership
  * is lost while a cloned one always joins a group whose name is
  * ifc->ifc_name.  To recover this after if_detach_internal() and
  * if_attach_internal(), the cloner should be specified to
  * if_attach_internal() via ifc.  If it is non-NULL, if_attach_internal()
  * attempts to join a group whose name is ifc->ifc_name.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to ensure if_xname is unique or non-empty.
  */
 /* Public entry point: attach ifp as a new interface (not a vnet move). */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, false);
 }
 
 /*
  * Compute the least common TSO limit.
  */
 /*
  * Fold the TSO limits of ifp into the running minimum kept in *pmax.
  * The same rule is applied independently to the byte limit, the segment
  * count and the segment size; a value of zero is treated as "no limit".
  */
 void
 if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	/*
 	 * 1) If there is no limit currently, take the limit from
 	 * the network adapter.
 	 *
 	 * 2) If the network adapter has a limit below the current
 	 * limit, apply it.
 	 */
 	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
 	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
 		pmax->tsomaxbytes = ifp->if_hw_tsomax;
 	}
 	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
 	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
 		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 	}
 	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
 	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
 		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 }
 
 /*
  * Update TSO limit of a network adapter.
  *
  * Returns zero if no change. Else non-zero.
  */
 int
 if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	int changed = 0;
 
 	/* Copy each limit that differs and count how many were updated. */
 	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
 		ifp->if_hw_tsomax = pmax->tsomaxbytes;
 		changed++;
 	}
 	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
 		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
 		changed++;
 	}
 	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
 		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
 		changed++;
 	}
 
 	return (changed);
 }
 
 /*
  * Common attach path, called with vmove == false from if_attach() and
  * with vmove == true from if_vmove().
  *
  * In both cases the ifnet joins the IFG_ALL group, if_lastchange and
  * if_epoch are refreshed, unset transmit/qflush/input/requestencap
  * methods get defaults, and the ifnet is linked and announced.
  *
  * Only on a first attach (!vmove) is the link-level ifaddr allocated and
  * are the TSO defaults applied.  On a vmove (VIMAGE only) the existing
  * AF_LINK addresses merely have the current if_index written into them.
  */
 static void
 if_attach_internal(struct ifnet *ifp, bool vmove)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	/* The index table slot for if_index must already point at us. */
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	/* The home vnet is recorded once and left alone afterwards. */
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 #ifdef VIMAGE
 	/* Restore group membership for cloned interface. */
 	if (vmove)
 		if_clone_restoregroup(ifp);
 #endif
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit_default;
 		ifp->if_qflush = if_qflush;
 	}
 	if (ifp->if_input == NULL)
 		ifp->if_input = if_input_default;
 
 	if (ifp->if_requestencap == NULL)
 		ifp->if_requestencap = if_requestencap_default;
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possible name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		/*
 		 * One allocation holds the ifaddr followed by two
 		 * sockaddr_dl's of socksize bytes each: the address first,
 		 * the netmask second.
 		 */
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		/* Step over the address to the netmask sockaddr_dl. */
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		/* The mask covers exactly the bytes of the interface name. */
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 		if (ifp->if_type == IFT_ETHER) {
 			ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
 			    M_WAITOK | M_ZERO);
 		}
 
 #if defined(INET) || defined(INET6)
 		/* Use defaults for TSO, if nothing is set */
 		if (ifp->if_hw_tsomax == 0 &&
 		    ifp->if_hw_tsomaxsegcount == 0 &&
 		    ifp->if_hw_tsomaxsegsize == 0) {
 			/*
 			 * The TSO defaults needs to be such that an
 			 * NFS mbuf list of 35 mbufs totalling just
 			 * below 64K works and that a chain of mbufs
 			 * can be defragged into at most 32 segments:
 			 */
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			ifp->if_hw_tsomaxsegcount = 35;
 			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */
 
 			/* XXX some drivers set IFCAP_TSO after ethernet attach */
 			if (ifp->if_capabilities & IFCAP_TSO) {
 				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
 				    ifp->if_hw_tsomax,
 				    ifp->if_hw_tsomaxsegcount,
 				    ifp->if_hw_tsomaxsegsize);
 			}
 		}
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	if_link_ifnet(ifp);
 
 	/*
 	 * Attach the per-domain data right away only once
 	 * domain_init_status has reached 2; before that the
 	 * if_attachdomain() SYSINIT walks the ifnet list instead.
 	 */
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	/* devctl is notified only for the default vnet. */
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 }
 
 /*
  * Allocate the preemptible network epoch and store it in
  * net_epoch_preempt.  Registered below as a SYSINIT at SI_SUB_EPOCH;
  * the argument is unused.
  */
 static void
 if_epochalloc(void *dummy __unused)
 {
 
 	net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT);
 }
 SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL);
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
 	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	SLIST_FOREACH(dp, &domains, dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  * Addresses of family AF_LINK are left in place.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 #ifdef INET6
 	/*
 	 * Need to leave multicast addresses of proxy NDP llentries
 	 * before in6_purgeifaddr() because the llentries are keys
 	 * for in6_multi objects of proxy NDP entries.
 	 * in6_purgeifaddr()s clean up llentries including proxy NDPs
 	 * then we would lose the keys if they are called earlier.
 	 */
 	in6_purge_proxy_ndp(ifp);
 #endif
 	for (;;) {
 		struct epoch_tracker et;
 
 		/* Find the first address that is not link-level. */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_LINK)
 				break;
 		}
 		NET_EPOCH_EXIT(et);
 
 		/* Nothing but link-level addresses left: done. */
 		if (ifa == NULL)
 			return;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq req;
 
 			bzero(&req, sizeof(req));
 			req.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr != NULL)
 				req.ifra_broadaddr = *ifa->ifa_dstaddr;
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&req, ifp,
 			    NULL) == 0)
 				continue;
 			/* On failure fall through to the generic removal. */
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeifaddr((struct in6_ifaddr *)ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		/* Generic case: unlink under the address lock, then free. */
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 		IF_ADDR_WUNLOCK(ifp);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_WLOCK(ifp);
 	while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
 		ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		if_delmulti_locked(ifp, ifma, 1);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * The detach proper only happens if if_unlink_ifnet() reports that the
  * ifnet was found, and runs with ifnet_detach_sxlock held exclusively.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if (if_unlink_ifnet(ifp, false)) {
 		sx_xlock(&ifnet_detach_sxlock);
 		if_detach_internal(ifp, false);
 		sx_xunlock(&ifnet_detach_sxlock);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Common detach path for if_detach() (vmove == false) and if_vmove()
  * (vmove == true).
  *
  * The vmove flag, if set, indicates that we are called from a callpath
  * that is moving an interface to a different vnet instance.
  *
  * There is no shutdown argument: under VIMAGE the function itself
  * samples VNET_IS_SHUTTING_DOWN(ifp->if_vnet) on entry.  When the vnet
  * is shutting down, only the group/taskqueue/if_down work, the departure
  * event and the per-domain detach are performed.
  * NOTE(review): earlier wording described "shutdown" as a flag set by
  * the vnet_if_return SYSUNINIT, and noted that we can be called on a
  * vnet instance shutdown without it being set, e.g., when the cloned
  * interfaces are destroyed as first thing of teardown -- confirm how
  * much of that still applies.
  *
  * Every return path returns 0.
  */
 static int
 if_detach_internal(struct ifnet *ifp, bool vmove)
 {
 	struct ifaddr *ifa;
 	int i;
 	struct domain *dp;
 #ifdef VIMAGE
 	bool shutdown;
 
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 #endif
 
 	/*
 	 * At this point we know the interface still was on the ifnet list
 	 * and we removed it so we are in a stable state.
 	 */
 	epoch_wait_preempt(net_epoch_preempt);
 
 	/*
 	 * Ensure all pending EPOCH(9) callbacks have been executed. This
 	 * fixes issues about late destruction of multicast options
 	 * which lead to leave group calls, which in turn access the
 	 * belonging ifnet structure:
 	 */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	/*
 	 * In any case (destroy or vmove) detach us from the groups
 	 * and remove/wait for pending events on the taskq.
 	 * XXX-BZ in theory an interface could still enqueue a taskq change?
 	 */
 	if_delgroups(ifp);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 	taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask);
 
 	if_down(ifp);
 
 #ifdef VIMAGE
 	/*
 	 * On VNET shutdown abort here as the stack teardown will do all
 	 * the work top-down for us.
 	 */
 	if (shutdown) {
 		/* Give interface users the chance to clean up. */
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		/*
 		 * In case of a vmove we are done here without error.
 		 * If we would signal an error it would lead to the same
 		 * abort as if we did not find the ifnet anymore.
 		 * if_detach() calls us in void context and does not care
 		 * about an early abort notification, so life is splendid :)
 		 */
 		goto finish_vnet_shutdown;
 	}
 #endif
 
 	/*
 	 * At this point we are not tearing down a VNET and are either
 	 * going to destroy or vmove the interface and have to cleanup
 	 * accordingly.
 	 */
 
 	/*
 	 * Remove routes and flush queues.
 	 */
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp);
 #endif
 	if_purgemaddrs(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	/* devctl is notified only for the default vnet. */
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Clean up all addresses.
 		 */
 		/*
 		 * Only the first entry of if_addrhead is removed here;
 		 * if_purgeaddrs() above already removed everything that is
 		 * not AF_LINK.
 		 */
 		IF_ADDR_WLOCK(ifp);
 		if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = CK_STAILQ_FIRST(&ifp->if_addrhead);
 			CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 			IF_ADDR_WUNLOCK(ifp);
 			ifa_free(ifa);
 		} else
 			IF_ADDR_WUNLOCK(ifp);
 	}
 
 	rt_flushifroutes(ifp);
 
 #ifdef VIMAGE
 finish_vnet_shutdown:
 #endif
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	/* Per-domain data was never attached: nothing to tear down. */
 	if (i == 0)
 		return (0);
 	SLIST_FOREACH(dp, &domains, dom_next) {
 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
 			ifp->if_afdata[dp->dom_family] = NULL;
 		}
 	}
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  */
 static int
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 #ifdef DEV_BPF
 	u_int bif_dlt, bif_hdrlen;
 #endif
 	int rc;
 
 #ifdef DEV_BPF
  	/*
 	 * if_detach_internal() will call the eventhandler to notify
 	 * interface departure.  That will detach if_bpf.  We need to
 	 * safe the dlt and hdrlen so we can re-attach it later.
 	 */
 	bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
 #endif
 
 	/*
 	 * Detach from current vnet, but preserve LLADDR info, do not
 	 * mark as dead etc. so that the ifnet can be reattached later.
 	 * If we cannot find it, we lost the race to someone else.
 	 */
 	rc = if_detach_internal(ifp, true);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Perform interface-specific reassignment tasks, if provided by
 	 * the driver.
 	 */
 	if (ifp->if_reassign != NULL)
 		ifp->if_reassign(ifp, new_vnet, NULL);
 
 	/*
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
 	if_attach_internal(ifp, true);
 
 #ifdef DEV_BPF
 	if (ifp->if_bpf == NULL)
 		bpfattach(ifp, bif_dlt, bif_hdrlen);
 #endif
 
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  *
  * "ifname" is used to check for a name clash in the destination vnet
  * and, on success, receives the interface's if_xname after the move.
  *
  * Returns 0 on success, ENXIO if no child prison matches "jid", EEXIST
  * if the interface already is in that prison's vnet or the name is taken
  * there, EBUSY if the interface's current vnet is shutting down, ENODEV
  * if the interface could not be unlinked, or the error from if_vmove().
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
 	int error;
 	bool found __diagused;
 	bool shutdown;
 
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exists in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
 	if (difp != NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 	sx_xlock(&ifnet_detach_sxlock);
 
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 	if (shutdown) {
 		sx_xunlock(&ifnet_detach_sxlock);
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 	CURVNET_RESTORE();
 
 	/*
 	 * From here on we are back in the caller's vnet context; the
 	 * error path below must not restore curvnet a second time.
 	 */
 	found = if_unlink_ifnet(ifp, true);
 	if (! found) {
 		sx_xunlock(&ifnet_detach_sxlock);
 		prison_free(pr);
 		return (ENODEV);
 	}
 
 	/* Move the interface into the child jail/vnet. */
 	error = if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland on success. */
 	if (error == 0)
 		sprintf(ifname, "%s", ifp->if_xname);
 
 	sx_xunlock(&ifnet_detach_sxlock);
 
 	prison_free(pr);
 	return (error);
 }
 
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
 	int error, found __diagused;
  	bool shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 	if (shutdown) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 
 	/* Get interface back from child jail/vnet. */
 	found = if_unlink_ifnet(ifp, true);
 	MPASS(found);
 	sx_xlock(&ifnet_detach_sxlock);
 	error = if_vmove(ifp, vnet_dst);
 	sx_xunlock(&ifnet_detach_sxlock);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland on success. */
 	if (error == 0)
 		sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (error);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		CK_STAILQ_INIT(&ifg->ifg_members);
 		CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Helper function to remove a group out of an interface.  Expects the global
  * ifnet lock to be write-locked, and drops it before returning.
  */
 static void
 _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl,
     const char *groupname)
 {
 	struct ifg_member *ifgm;
 	bool freeifgl;
 
 	IFNET_WLOCK_ASSERT();
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) {
 		if (ifgm->ifgm_ifp == ifp) {
 			CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
 			    ifg_member, ifgm_next);
 			break;
 		}
 	}
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group,
 		    ifg_next);
 		freeifgl = true;
 	} else {
 		freeifgl = false;
 	}
 	IFNET_WUNLOCK();
 
 	epoch_wait_preempt(net_epoch_preempt);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 	if (freeifgl) {
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	}
 	free(ifgm, M_TEMP);
 	free(ifgl, M_TEMP);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list *ifgl;
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0)
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	_if_delgroup_locked(ifp, ifgl, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups.
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list *ifgl;
 	char groupname[IFNAMSIZ];
 
 	for (;;) {
 		IFNET_WLOCK();
 		ifgl = CK_STAILQ_FIRST(&ifp->if_groups);
 		if (ifgl == NULL)
 			break;
 		/*
 		 * Work on a private copy of the name, and note that the
 		 * helper drops the ifnet lock, so it is taken anew on every
 		 * round.
 		 */
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 		_if_delgroup_locked(ifp, ifgl, groupname);
 	}
 	IFNET_WUNLOCK();
 }
 
 /*
  * Stores all groups from an interface in memory pointed to by ifgr.
  *
  * With ifgr_len == 0 nothing is copied out; ifgr_len is set to the size
  * needed for all groups instead.  Otherwise one struct ifg_req per group
  * is copied to ifgr_groups.  Returns 0 on success, EINVAL if the space
  * runs out before all groups are stored, or the error from copyout().
  */
 static int
 if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	NET_EPOCH_ASSERT();
 
 	/* Size query only. */
 	if (ifgr->ifgr_len == 0) {
 		CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		}
 		return (0);
 	}
 
 	ifgp = ifgr->ifgr_groups;
 	len = ifgr->ifgr_len;
 	/* XXX: wire */
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq))
 			return (EINVAL);
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req));
 		if (error != 0)
 			return (error);
 		ifgp++;
 		len -= sizeof(ifgrq);
 	}
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by ifgr.
  *
  * The group is looked up by ifgr_name.  With ifgr_len == 0 nothing is
  * copied out; ifgr_len is set to the size needed for all members
  * instead.  Returns 0 on success, ENOENT if there is no such group,
  * EINVAL if the space runs out, or the error from copyout().
  */
 static int
 if_getgroupmembers(struct ifgroupreq *ifgr)
 {
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	error = 0;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) {
 		if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0)
 			break;
 	}
 	if (ifg == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 
 	/* Size query only. */
 	if (ifgr->ifgr_len == 0) {
 		CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 			ifgr->ifgr_len += sizeof(ifgrq);
 		}
 		goto out;
 	}
 
 	ifgp = ifgr->ifgr_groups;
 	len = ifgr->ifgr_len;
 	CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			error = EINVAL;
 			goto out;
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req));
 		if (error != 0)
 			goto out;
 		ifgp++;
 		len -= sizeof(ifgrq);
 	}
 
 out:
 	IFNET_RUNLOCK();
 	return (error);
 }
 
 /*
  * Return counter values from counter(9)s stored in ifnet.
  * "cnt" selects the counter and must be below IFCOUNTERS.
  */
 uint64_t
 if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
 {
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 	return (counter_u64_fetch(ifp->if_counters[cnt]));
 }
 
 /*
  * Increase an ifnet counter. Usually used for counters shared
  * between the stack and a driver, but function supports them all.
  * "cnt" selects the counter and must be below IFCOUNTERS; "inc" is the
  * amount added to it.
  */
 void
 if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
 {
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 	counter_u64_add(ifp->if_counters[cnt], inc);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  *
  * Plain fields are copied from the ifnet; ifi_physical and ifi_vhid are
  * always reported as 0 and ifi_datalen as sizeof(struct if_data).  All
  * statistics are obtained through the interface's if_get_counter method.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	/* Static properties and state. */
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	/* Statistics, one if_get_counter call per counter. */
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 
 /*
  * Allocate a zeroed ifaddr of "size" bytes (at least sizeof(struct
  * ifaddr)) together with its four packet/byte counters.  "flags" is
  * handed to malloc() and counter_u64_alloc().  The new ifaddr carries
  * one reference.  Returns NULL if any allocation fails.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (ifa == NULL)
 		return (NULL);
 
 	ifa->ifa_opackets = counter_u64_alloc(flags);
 	if (ifa->ifa_opackets == NULL)
 		goto fail;
 	ifa->ifa_ipackets = counter_u64_alloc(flags);
 	if (ifa->ifa_ipackets == NULL)
 		goto fail;
 	ifa->ifa_obytes = counter_u64_alloc(flags);
 	if (ifa->ifa_obytes == NULL)
 		goto fail;
 	ifa->ifa_ibytes = counter_u64_alloc(flags);
 	if (ifa->ifa_ibytes == NULL)
 		goto fail;
 
 	refcount_init(&ifa->ifa_refcnt, 1);
 
 	return (ifa);
 
 fail:
 	/*
 	 * free(NULL) is okay: counters that were never allocated are
 	 * still NULL thanks to M_ZERO.
 	 */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 
 	return (NULL);
 }
 
 void
 ifa_ref(struct ifaddr *ifa)
 {
 	u_int old __diagused;
 
 	old = refcount_acquire(&ifa->ifa_refcnt);
 	KASSERT(old > 0, ("%s: ifa %p has 0 refs", __func__, ifa));
 }
 
 /*
  * Take a reference on an ifaddr unless its reference count already
  * is zero.  Must be called inside the network epoch.  Returns the
  * result of refcount_acquire_if_not_zero().
  */
 int
 ifa_try_ref(struct ifaddr *ifa)
 {
 	NET_EPOCH_ASSERT();
 
 	return (refcount_acquire_if_not_zero(&ifa->ifa_refcnt));
 }
 
 /*
  * Epoch callback scheduled by ifa_free(): release the counters and the
  * memory of the ifaddr that embeds "ctx" as its ifa_epoch_ctx.
  */
 static void
 ifa_destroy(epoch_context_t ctx)
 {
 	struct ifaddr *dead;
 
 	dead = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
 
 	counter_u64_free(dead->ifa_opackets);
 	counter_u64_free(dead->ifa_ipackets);
 	counter_u64_free(dead->ifa_obytes);
 	counter_u64_free(dead->ifa_ibytes);
 	free(dead, M_IFADDR);
 }
 
 /*
  * Drop one reference on an ifaddr.  When the last reference goes away
  * the destruction is handed to ifa_destroy() via NET_EPOCH_CALL().
  */
 void
 ifa_free(struct ifaddr *ifa)
 {
 	if (!refcount_release(&ifa->ifa_refcnt))
 		return;
 
 	NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx);
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  *
  * sa_dl_equal(a1, a2) is true when both sockaddr_dl's have the same
  * sdl_len and their link-level address bytes (as located by CLLADDR())
  * are equal over the sdl_alen of a1.  Both arguments are evaluated more
  * than once, so they must be free of side effects.
  */
 
 #define	sa_dl_equal(a1, a2)	\
 	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
 	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
 	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
 	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
 	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  *
  * All addresses of all interfaces on V_ifnet are searched for one of
  * the same family that equals "addr", either as the address itself or,
  * on IFF_BROADCAST interfaces, as the broadcast address.  Returns the
  * first matching ifaddr or NULL.  Must be called inside the network
  * epoch.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithaddr(const struct sockaddr *addr)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (sa_equal(addr, ifa->ifa_addr))
 				return (ifa);
 			/* IP6 doesn't have broadcast */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr))
 				return (ifa);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Tell whether ifa_ifwithaddr() finds a match for "addr".  The network
  * epoch is entered here on behalf of the caller.  Returns 1 if a match
  * exists, 0 otherwise.
  */
 int
 ifa_ifwithaddr_check(const struct sockaddr *addr)
 {
 	struct epoch_tracker et;
 	int present;
 
 	NET_EPOCH_ENTER(et);
 	present = (ifa_ifwithaddr(addr) != NULL);
 	NET_EPOCH_EXIT(et);
 
 	return (present);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  *
  * Only IFF_BROADCAST interfaces can match.  Unless "fibnum" is
  * RT_ALL_FIBS, interfaces whose if_fib differs from it are skipped.
  * Returns the first matching ifaddr or NULL.  Must be called inside the
  * network epoch.
  */
 /* ARGSUSED */
 struct ifaddr *
 ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr))
 				return (ifa);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  *
  * Only IFF_POINTOPOINT interfaces are considered.  Unless "fibnum" is
  * RT_ALL_FIBS, interfaces whose if_fib differs from it are skipped.
  * Returns the first matching ifaddr or NULL.  Must be called inside the
  * network epoch.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr))
 				return (ifa);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  *
  * AF_LINK lookups are answered directly from the interface index stored
  * in "addr".  For AF_INET on point-to-point interfaces (unless
  * "ignore_ptp" is set) only an exact destination address match counts.
  * In all other cases "addr" is compared against each address under its
  * netmask.  Unless "fibnum" is RT_ALL_FIBS, interfaces whose if_fib
  * differs from it are skipped.  Returns the chosen ifaddr or NULL.
  * Must be called inside the network epoch.
  */
 struct ifaddr *
 ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	const char *addr_data = addr->sa_data, *cplim;
 
 	NET_EPOCH_ASSERT();
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 		ifp = ifnet_byindex(
 		    ((const struct sockaddr_dl *)addr)->sdl_index);
 		return (ifp ? ifp->if_addr : NULL);
 	}
 
 	/*
 	 * Scan though each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.
 	 */
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			const char *cp, *cp2, *cp3;
 
 			/*
 			 * The "next" label sits on this continue so that the
 			 * byte-compare loop further down can move on to the
 			 * next address from inside its own loop.
 			 */
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit disagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				/* The netmask's sa_len bounds the comparison. */
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					ifa_maybe = ifa;
 				}
 			}
 		}
 	}
 	/* No exact point-to-point hit: fall back to the best candidate. */
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  *
  * Only addresses of the same family as "addr" are considered.  An
  * address without a netmask matches if "addr" equals the address or its
  * destination address.  Otherwise, on point-to-point interfaces the
  * destination address has to match, and on all other interfaces the
  * address has to match under the netmask.  If nothing matches, the
  * first address of the family is returned, or NULL if the interface has
  * none.  NULL is also returned for an address family of AF_MAX or
  * above.  Must be called inside the network epoch.
  */
 struct ifaddr *
 ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	const char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 
 	NET_EPOCH_ASSERT();
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		/* Remember the first address of the family as fallback. */
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			/*
 			 * An address may have a netmask but no destination
 			 * address; never hand a NULL pointer to sa_equal().
 			 */
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			/* The netmask's sa_len bounds the comparison. */
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			/* Reaching the limit means no masked byte differed. */
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Returns non-zero if "next" is to be preferred over "cur".
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 	/* A non-virtual current address is never displaced. */
 	if (!cur->ifa_carp)
 		return (0);
 
 	/* Current is virtual, the candidate is not: take the candidate. */
 	if (!next->ifa_carp)
 		return (1);
 
 	/* Both virtual: only a master beats a non-master. */
 	return ((*carp_master_p)(next) && !(*carp_master_p)(cur));
 }
 
 /*
  * Allocate "size" bytes of M_TEMP memory for a link-level sockaddr.
  * "flags" is handed to malloc() unchanged.
  */
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = malloc(size, M_TEMP, flags);
 	return (sdl);
 }
 
 /*
  * Return a sockaddr to the M_TEMP malloc type.
  */
 void
 link_free_sdl(struct sockaddr *sa)
 {
 
 	free(sa, M_TEMP);
 }
 
 /*
  * Initialise the caller-supplied storage at paddr as an AF_LINK
  * sockaddr_dl carrying the interface index and the given interface type.
  * The storage is zeroed first.  Returns paddr viewed as a sockaddr_dl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)paddr;
 
 	memset(sdl, 0, sizeof(*sdl));
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_len = sizeof(*sdl);
 	sdl->sdl_type = iftype;
 	sdl->sdl_index = ifp->if_index;
 
 	return (sdl);
 }
 
 /*
  * Clear IFF_UP on an interface, flush its send queue and tell CARP
  * (when attached) and routing socket listeners about the transition.
  * "fam" is unused.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags = ifp->if_flags & ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	(*ifp->if_qflush)(ifp);
 
 	if (ifp->if_carp) {
 		(*carp_linkstate_p)(ifp);
 	}
 	rt_ifmsg(ifp, IFF_UP);
 }
 
 /*
  * Set IFF_UP on an interface and tell CARP (when attached), routing
  * socket listeners and, with INET6, the IPv6 code about the transition.
  * "fam" is unused.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags = ifp->if_flags | flag;
 	getmicrotime(&ifp->if_lastchange);
 
 	if (ifp->if_carp) {
 		(*carp_linkstate_p)(ifp);
 	}
 	rt_ifmsg(ifp, IFF_UP);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 /*
  * Indirect entry points into the vlan code.  vlan_link_state_p is
  * called from do_link_state_change() for interfaces that have a vlan
  * trunk attached (if_vlantrunk != NULL).
  * NOTE(review): these are presumably filled in by if_vlan when it is
  * loaded and left NULL otherwise -- confirm against if_vlan.c.
  */
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_pcp_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Record a new link state for an interface.  To avoid LORs between
  * the driver lock and upper layer locks, as well as possible
  * recursion, the actual notifications are not sent from here: a task
  * is queued on taskqueue_swi and the work is done in
  * do_link_state_change().  A call that does not change the state is
  * ignored.
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 
 	if (ifp->if_link_state != link_state) {
 		ifp->if_link_state = link_state;
 
 		/* XXXGL: reference ifp? */
 		taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 	}
 }
 
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp;
 	int link_state;
 
 	ifp = arg;
 	link_state = ifp->if_link_state;
 
 	CURVNET_SET(ifp->if_vnet);
 	rt_ifmsg(ifp, 0);
 	if (ifp->if_vlantrunk != NULL)
 		(*vlan_link_state_p)(ifp);
 
 	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
 	    ifp->if_l2com != NULL)
 		(*ng_ether_link_state_p)(ifp, link_state);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	if (ifp->if_bridge)
 		ifp->if_bridge_linkstate(ifp);
 	if (ifp->if_lagg)
 		(*lagg_linkstate_p)(ifp, link_state);
 
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
 		    NULL);
 	if (pending > 1)
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	if (log_link_state_change)
 		if_printf(ifp, "link state changed to %s\n",
 		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Take an interface down: fire the IFNET_EVENT_DOWN event handlers
  * first, then clear IFF_UP and notify protocols via if_unroute().
  */
 void
 if_down(struct ifnet *ifp)
 {
 	/* Handlers run while the interface is still marked up. */
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN);
 
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Bring an interface up: set IFF_UP and notify protocols via
  * if_route(), then fire the IFNET_EVENT_UP event handlers.
  */
 void
 if_up(struct ifnet *ifp)
 {
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 
 	/* Handlers run once the interface is already marked up. */
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP);
 }
 
 /*
  * Discard everything waiting on an interface's send queue.  With the
  * queue locked, an enabled ALTQ discipline is purged, every packet
  * chained from ifq_head is freed and the queue is reset to empty.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct ifaltq *ifq = &ifp->if_snd;
 	struct mbuf *pkt, *following;
 
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
 		ALTQ_PURGE(ifq);
 #endif
 	/* Grab the successor before the packet is freed. */
 	for (pkt = ifq->ifq_head; pkt != NULL; pkt = following) {
 		following = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 	ifq->ifq_head = NULL;
 	ifq->ifq_tail = NULL;
 	ifq->ifq_len = 0;
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Look up an interface in the current vnet by name and return it with
  * a reference taken via if_ref().  Interfaces flagged IFF_DYING are
  * skipped.  Returns NULL when no interface matches.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_DYING) != 0)
 			continue;
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) != 0)
 			continue;
 
 		/* Found it: take the reference before leaving the epoch. */
 		if_ref(ifp);
 		MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (ifp);
 }
 
 /*
  * Look up an interface in the current vnet by name.  No reference is
  * taken on the result, and IFF_DYING interfaces are not filtered out.
  * Returns NULL when no interface matches.
  */
 struct ifnet *
 ifunit(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0) {
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (ifp);
 }
 
 /*
  * Fetch the user buffer pointer from the ifru_buffer member of an
  * ifreq.  For a 32-bit (SV_ILP32) process under COMPAT_FREEBSD32 the
  * value is read from the 32-bit layout and widened to a pointer.
  */
 void *
 ifr_buffer_get_buffer(void *data)
 {
 	union ifreq_union *req = data;
 	void *ubuf;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		ubuf = (void *)(uintptr_t)
 		    req->ifr32.ifr_ifru.ifru_buffer.buffer;
 		return (ubuf);
 	}
 #endif
 	ubuf = req->ifr.ifr_ifru.ifru_buffer.buffer;
 	return (ubuf);
 }
 
 /*
  * Clear the user buffer pointer in the ifru_buffer member of an ifreq,
  * using the 32-bit layout for a 32-bit (SV_ILP32) process under
  * COMPAT_FREEBSD32.
  */
 static void
 ifr_buffer_set_buffer_null(void *data)
 {
 	union ifreq_union *req = data;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		req->ifr32.ifr_ifru.ifru_buffer.buffer = 0;
 		return;
 	}
 #endif
 	req->ifr.ifr_ifru.ifru_buffer.buffer = NULL;
 }
 
 /*
  * Fetch the length field from the ifru_buffer member of an ifreq,
  * using the 32-bit layout for a 32-bit (SV_ILP32) process under
  * COMPAT_FREEBSD32.
  */
 size_t
 ifr_buffer_get_length(void *data)
 {
 	union ifreq_union *req = data;
 	size_t len;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		len = req->ifr32.ifr_ifru.ifru_buffer.length;
 		return (len);
 	}
 #endif
 	len = req->ifr.ifr_ifru.ifru_buffer.length;
 	return (len);
 }
 
 /*
  * Store "len" in the length field of the ifru_buffer member of an
  * ifreq, using the 32-bit layout for a 32-bit (SV_ILP32) process under
  * COMPAT_FREEBSD32.
  */
 static void
 ifr_buffer_set_length(void *data, size_t len)
 {
 	union ifreq_union *req = data;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		req->ifr32.ifr_ifru.ifru_buffer.length = len;
 		return;
 	}
 #endif
 	req->ifr.ifr_ifru.ifru_buffer.length = len;
 }
 
 /*
  * Fetch the ifru_data user pointer from an ifreq.  For a 32-bit
  * (SV_ILP32) process under COMPAT_FREEBSD32 the value is read from the
  * 32-bit layout and widened to a pointer.
  */
 void *
 ifr_data_get_ptr(void *ifrp)
 {
 	union ifreq_union *req = ifrp;
 	void *uptr;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		uptr = (void *)(uintptr_t)req->ifr32.ifr_ifru.ifru_data;
 		return (uptr);
 	}
 #endif
 	uptr = req->ifr.ifr_ifru.ifru_data;
 	return (uptr);
 }
 
 /*
  * Pairs one interface capability bit with the name used for it as a
  * boolean key in a capabilities nvlist.
  */
 struct ifcap_nv_bit_name {
 	uint64_t cap_bit;	/* IFCAP_* / IFCAP2_* bit value */
 	const char *cap_name;	/* matching *_NAME string; NULL ends a table */
 };
 /* Build a table entry from IFCAP_<x> and IFCAP_<x>_NAME. */
 #define CAPNV(x) {.cap_bit = IFCAP_##x, \
     .cap_name = __CONCAT(IFCAP_, __CONCAT(x, _NAME)) }
 /*
  * Names for the bits of if_capabilities / if_capenable.  The table is
  * terminated by an entry whose cap_name is NULL; if_capnv_to_capint()
  * and if_capint_to_capnv() rely on that terminator.
  */
 const struct ifcap_nv_bit_name ifcap_nv_bit_names[] = {
 	CAPNV(RXCSUM),
 	CAPNV(TXCSUM),
 	CAPNV(NETCONS),
 	CAPNV(VLAN_MTU),
 	CAPNV(VLAN_HWTAGGING),
 	CAPNV(JUMBO_MTU),
 	CAPNV(POLLING),
 	CAPNV(VLAN_HWCSUM),
 	CAPNV(TSO4),
 	CAPNV(TSO6),
 	CAPNV(LRO),
 	CAPNV(WOL_UCAST),
 	CAPNV(WOL_MCAST),
 	CAPNV(WOL_MAGIC),
 	CAPNV(TOE4),
 	CAPNV(TOE6),
 	CAPNV(VLAN_HWFILTER),
 	CAPNV(VLAN_HWTSO),
 	CAPNV(LINKSTATE),
 	CAPNV(NETMAP),
 	CAPNV(RXCSUM_IPV6),
 	CAPNV(TXCSUM_IPV6),
 	CAPNV(HWSTATS),
 	CAPNV(TXRTLMT),
 	CAPNV(HWRXTSTMP),
 	CAPNV(MEXTPG),
 	CAPNV(TXTLS4),
 	CAPNV(TXTLS6),
 	CAPNV(VXLAN_HWCSUM),
 	CAPNV(VXLAN_HWTSO),
 	CAPNV(TXTLS_RTLMT),
 	{0, NULL}
 };
 /* Build a table entry from IFCAP2_<x> and IFCAP2_<x>_NAME. */
 #define CAP2NV(x) {.cap_bit = IFCAP2_##x, \
     .cap_name = __CONCAT(IFCAP2_, __CONCAT(x, _NAME)) }
 /*
  * Names for the bits of if_capabilities2 / if_capenable2, terminated
  * the same way as ifcap_nv_bit_names.
  */
 const struct ifcap_nv_bit_name ifcap2_nv_bit_names[] = {
 	CAP2NV(RXTLS4),
 	CAP2NV(RXTLS6),
 	{0, NULL}
 };
 /* The entry-building macros are only needed for the tables above. */
 #undef CAPNV
 #undef CAP2NV
 
 /*
  * Convert a capabilities nvlist into a capability bit mask.
  *
  * For every name in the NULL-terminated table "nn":
  *  - if the nvlist has a boolean of that name, the bit is set when
  *    "all" is true or the boolean is true;
  *  - otherwise the bit keeps the value it has in *old_cap.
  * Returns the resulting mask.
  */
 int
 if_capnv_to_capint(const nvlist_t *nv, int *old_cap,
     const struct ifcap_nv_bit_name *nn, bool all)
 {
 	const struct ifcap_nv_bit_name *ent;
 	int res = 0;
 
 	for (ent = nn; ent->cap_name != NULL; ent++) {
 		if (!nvlist_exists_bool(nv, ent->cap_name)) {
 			/* Not mentioned by the caller: carry over. */
 			res |= *old_cap & ent->cap_bit;
 			continue;
 		}
 		if (all || nvlist_get_bool(nv, ent->cap_name))
 			res |= ent->cap_bit;
 	}
 	return (res);
 }
 
 /*
  * Add capability bits to an nvlist as named booleans.
  *
  * For every entry of the NULL-terminated table "nn" whose bit is set in
  * ifr_cap, a boolean of that name is added to "nv"; its value tells
  * whether the same bit is set in ifr_req.  Bits not set in ifr_cap are
  * left out of the nvlist.
  */
 void
 if_capint_to_capnv(nvlist_t *nv, const struct ifcap_nv_bit_name *nn,
     int ifr_cap, int ifr_req)
 {
 	const struct ifcap_nv_bit_name *ent;
 
 	for (ent = nn; ent->cap_name != NULL; ent++) {
 		if ((ent->cap_bit & ifr_cap) == 0)
 			continue;
 		nvlist_add_bool(nv, ent->cap_name,
 		    (ent->cap_bit & ifr_req) != 0);
 	}
 }
 
 /*
  * Hardware specific interface ioctls.
  *
  * Handles the interface requests that are either answered directly
  * from the ifnet or handed to the driver's if_ioctl method.  "data" is
  * the request structure (a struct ifreq for most commands) and "td" the
  * calling thread, used for privilege checks.
  *
  * Returns 0 on success or an errno value.  Commands not recognised
  * here yield ENOIOCTL, which ifioctl() uses as the cue to pass the
  * request on to the protocol.
  */
 int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0, do_ifup = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen, nvbuflen;
 	char *descrbuf;
 	char new_name[IFNAMSIZ];
 	char old_name[IFNAMSIZ], strbuf[IFNAMSIZ + 8];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	void *buf;
 	nvlist_t *nvcap;
 	struct siocsifcapnv_driver_data drv_ioctl_data;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		/* Stack and driver flags are reported as one 32-bit word. */
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 	case SIOCGIFCAPNV:
 		if ((ifp->if_capabilities & IFCAP_NV) == 0) {
 			error = EINVAL;
 			break;
 		}
 		buf = NULL;
 		nvcap = nvlist_create(0);
 		/*
 		 * Single-pass loop: every path ends in "break" so that the
 		 * cleanup below the loop always runs.
 		 */
 		for (;;) {
 			if_capint_to_capnv(nvcap, ifcap_nv_bit_names,
 			    ifp->if_capabilities, ifp->if_capenable);
 			if_capint_to_capnv(nvcap, ifcap2_nv_bit_names,
 			    ifp->if_capabilities2, ifp->if_capenable2);
 			error = (*ifp->if_ioctl)(ifp, SIOCGIFCAPNV,
 			    __DECONST(caddr_t, nvcap));
 			if (error != 0) {
 				if_printf(ifp,
 			    "SIOCGIFCAPNV driver mistake: nvlist error %d\n",
 				    error);
 				break;
 			}
 			buf = nvlist_pack(nvcap, &nvbuflen);
 			if (buf == NULL) {
 				error = nvlist_error(nvcap);
 				if (error == 0)
 					error = EDOOFUS;
 				break;
 			}
 			/*
 			 * User buffer too small: report the needed length
 			 * and fail with EFBIG.
 			 */
 			if (nvbuflen > ifr->ifr_cap_nv.buf_length) {
 				ifr->ifr_cap_nv.length = nvbuflen;
 				ifr->ifr_cap_nv.buffer = NULL;
 				error = EFBIG;
 				break;
 			}
 			ifr->ifr_cap_nv.length = nvbuflen;
 			error = copyout(buf, ifr->ifr_cap_nv.buffer, nvbuflen);
 			break;
 		}
 		free(buf, M_NVLIST);
 		nvlist_destroy(nvcap);
 		break;
 
 	case SIOCGIFDATA:
 	{
 		struct if_data ifd;
 
 		/* Ensure uninitialised padding is not leaked. */
 		memset(&ifd, 0, sizeof(ifd));
 
 		if_data_copy(ifp, &ifd);
 		error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd));
 		break;
 	}
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever work? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			/*
 			 * The required length is always reported back; the
 			 * buffer pointer is nulled when it was too small.
 			 */
 			if (ifr_buffer_get_length(ifr) < descrlen)
 				ifr_buffer_set_buffer_null(ifr);
 			else
 				error = copyout(ifp->if_description,
 				    ifr_buffer_get_buffer(ifr), descrlen);
 			ifr_buffer_set_length(ifr, descrlen);
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr_buffer_get_length(ifr) == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = if_allocdescr(ifr_buffer_get_length(ifr), M_WAITOK);
 			error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
 			    ifr_buffer_get_length(ifr) - 1);
 			if (error) {
 				if_freedescr(descrbuf);
 				break;
 			}
 		}
 
 		if_setdescr(ifp, descrbuf);
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		/*
 		 * Going down is handled right away; coming up is deferred
 		 * (do_ifup) until the new flags are stored and the driver
 		 * has been called.
 		 */
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			do_ifup = 1;
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			if (log_promisc_mode_change)
                                 if_printf(ifp, "permanently promiscuous mode %s\n",
                                     ((new_flags & IFF_PPROMISC) ?
                                      "enabled" : "disabled"));
 		}
 		/* Keep the IFF_CANTCHANGE bits, take the rest from the request. */
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		if (do_ifup)
 			if_up(ifp);
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error != 0)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		/* Refuse requests for capabilities the interface lacks. */
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAPNV:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error != 0)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if ((ifp->if_capabilities & IFCAP_NV) == 0)
 			return (EINVAL);
 		if (ifr->ifr_cap_nv.length > IFR_CAP_NV_MAXBUFSIZE)
 			return (EINVAL);
 		nvcap = NULL;
 		buf = malloc(ifr->ifr_cap_nv.length, M_TEMP, M_WAITOK);
 		/*
 		 * Single-pass loop: every path ends in "break" so that the
 		 * cleanup below the loop always runs.
 		 */
 		for (;;) {
 			error = copyin(ifr->ifr_cap_nv.buffer, buf,
 			    ifr->ifr_cap_nv.length);
 			if (error != 0)
 				break;
 			nvcap = nvlist_unpack(buf, ifr->ifr_cap_nv.length, 0);
 			if (nvcap == NULL) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * Translate the request into bit masks; bits the
 			 * nvlist does not mention keep their current value.
 			 */
 			drv_ioctl_data.reqcap = if_capnv_to_capint(nvcap,
 			    &ifp->if_capenable, ifcap_nv_bit_names, false);
 			if ((drv_ioctl_data.reqcap &
 			    ~ifp->if_capabilities) != 0) {
 				error = EINVAL;
 				break;
 			}
 			drv_ioctl_data.reqcap2 = if_capnv_to_capint(nvcap,
 			    &ifp->if_capenable2, ifcap2_nv_bit_names, false);
 			if ((drv_ioctl_data.reqcap2 &
 			    ~ifp->if_capabilities2) != 0) {
 				error = EINVAL;
 				break;
 			}
 			drv_ioctl_data.nvcap = nvcap;
 			error = (*ifp->if_ioctl)(ifp, SIOCSIFCAPNV,
 			    (caddr_t)&drv_ioctl_data);
 			break;
 		}
 		nvlist_destroy(nvcap);
 		free(buf, M_TEMP);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
 		    NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		/* Renaming to the current name is a successful no-op. */
 		if (strcmp(new_name, ifp->if_xname) == 0)
 			break;
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 		
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		if_printf(ifp, "changing name to '%s'\n", new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(old_name, ifp->if_xname, sizeof(old_name));
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		/*
 		 * Rebuild the netmask: clear the bytes that covered the old
 		 * name, then set 0xff over the length of the new name.
 		 */
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 
 		snprintf(strbuf, sizeof(strbuf), "name=%s", new_name);
 		devctl_notify("IFNET", old_name, "RENAME", strbuf);
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		/* Remembered to detect whether the driver changed the MTU. */
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		/* Disallow MTU changes on bridge member interfaces. */
 		if (ifp->if_bridge)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp, 0);
 #ifdef INET
 			DEBUGNET_NOTIFY_MTU(ifp);
 #endif
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct epoch_tracker et;
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			NET_EPOCH_ENTER(et);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			NET_EPOCH_EXIT(et);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	/* Privileged "set" requests implemented entirely by the driver. */
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	/* "Get" requests passed to the driver without a privilege check. */
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 	case SIOCGIFGENERIC:
 	case SIOCGIFRSSKEY:
 	case SIOCGIFRSSHASH:
 	case SIOCGIFDOWNREASON:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		break;
 
 	case SIOCGHWADDR:
 		error = if_gethwaddr(ifp, ifr);
 		break;
 
 	case SIOCAIFGROUP:
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		error = if_addgroup(ifp,
 		    ((struct ifgroupreq *)data)->ifgr_group);
 		if (error != 0)
 			return (error);
 		break;
 
 	case SIOCGIFGROUP:
 	{
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		error = if_getgroup((struct ifgroupreq *)data, ifp);
 		NET_EPOCH_EXIT(et);
 		break;
 	}
 
 	case SIOCDIFGROUP:
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		error = if_delgroup(ifp,
 		    ((struct ifgroupreq *)data)->ifgr_group);
 		if (error != 0)
 			return (error);
 		break;
 
 	default:
 		/* Not handled here; the caller decides what to try next. */
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Interface ioctls.
  *
  * Entry point for interface ioctls issued on socket "so".  The request
  * is processed in the socket's vnet in these steps:
  *  1. under COMPAT_FREEBSD32, 32-bit request layouts are converted into
  *     their native form in a local thunk;
  *  2. requests that do not name an existing interface (SIOCGIFCONF,
  *     cloning, group member listing, CARP, ...) are served directly;
  *  3. the interface named in the request is looked up with a reference
  *     held and the request is offered to ifhwioctl();
  *  4. if ifhwioctl() returns ENOIOCTL, the request goes to the socket
  *     protocol's pr_control and, for EOPNOTSUPP, to the driver;
  *  5. on success, 32-bit results are copied back from the thunk.
  * Returns 0 or an errno value.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 #ifdef COMPAT_FREEBSD32
 	/* Native-layout copy of a 32-bit request; "data" is pointed at it. */
 	union {
 		struct ifconf ifc;
 		struct ifdrv ifd;
 		struct ifgroupreq ifgr;
 		struct ifmediareq ifmr;
 	} thunk;
 	u_long saved_cmd;
 	struct ifconf32 *ifc32;
 	struct ifdrv32 *ifd32;
 	struct ifgroupreq32 *ifgr32;
 	struct ifmediareq32 *ifmr32;
 #endif
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
 #ifdef VIMAGE
 	bool shutdown;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 #ifdef VIMAGE
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet);
 	if (shutdown) {
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 #endif
 
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * Remember the original command: it selects which results are
 	 * copied back to the 32-bit structure at the end.
 	 */
 	saved_cmd = cmd;
 	switch (cmd) {
 	case SIOCGIFCONF32:
 		ifc32 = (struct ifconf32 *)data;
 		thunk.ifc.ifc_len = ifc32->ifc_len;
 		thunk.ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 		data = (caddr_t)&thunk.ifc;
 		cmd = SIOCGIFCONF;
 		break;
 	case SIOCGDRVSPEC32:
 	case SIOCSDRVSPEC32:
 		ifd32 = (struct ifdrv32 *)data;
 		memcpy(thunk.ifd.ifd_name, ifd32->ifd_name,
 		    sizeof(thunk.ifd.ifd_name));
 		thunk.ifd.ifd_cmd = ifd32->ifd_cmd;
 		thunk.ifd.ifd_len = ifd32->ifd_len;
 		thunk.ifd.ifd_data = PTRIN(ifd32->ifd_data);
 		data = (caddr_t)&thunk.ifd;
 		cmd = _IOC_NEWTYPE(cmd, struct ifdrv);
 		break;
 	case SIOCAIFGROUP32:
 	case SIOCGIFGROUP32:
 	case SIOCDIFGROUP32:
 	case SIOCGIFGMEMB32:
 		ifgr32 = (struct ifgroupreq32 *)data;
 		memcpy(thunk.ifgr.ifgr_name, ifgr32->ifgr_name,
 		    sizeof(thunk.ifgr.ifgr_name));
 		thunk.ifgr.ifgr_len = ifgr32->ifgr_len;
 		/*
 		 * Add/delete carry a group name, the "get" variants a
 		 * user pointer.
 		 */
 		switch (cmd) {
 		case SIOCAIFGROUP32:
 		case SIOCDIFGROUP32:
 			memcpy(thunk.ifgr.ifgr_group, ifgr32->ifgr_group,
 			    sizeof(thunk.ifgr.ifgr_group));
 			break;
 		case SIOCGIFGROUP32:
 		case SIOCGIFGMEMB32:
 			thunk.ifgr.ifgr_groups = PTRIN(ifgr32->ifgr_groups);
 			break;
 		}
 		data = (caddr_t)&thunk.ifgr;
 		cmd = _IOC_NEWTYPE(cmd, struct ifgroupreq);
 		break;
 	case SIOCGIFMEDIA32:
 	case SIOCGIFXMEDIA32:
 		ifmr32 = (struct ifmediareq32 *)data;
 		memcpy(thunk.ifmr.ifm_name, ifmr32->ifm_name,
 		    sizeof(thunk.ifmr.ifm_name));
 		thunk.ifmr.ifm_current = ifmr32->ifm_current;
 		thunk.ifmr.ifm_mask = ifmr32->ifm_mask;
 		thunk.ifmr.ifm_status = ifmr32->ifm_status;
 		thunk.ifmr.ifm_active = ifmr32->ifm_active;
 		thunk.ifmr.ifm_count = ifmr32->ifm_count;
 		thunk.ifmr.ifm_ulist = PTRIN(ifmr32->ifm_ulist);
 		data = (caddr_t)&thunk.ifmr;
 		cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
 		break;
 	}
 #endif
 
 	/* SIOCGIFCONF takes a struct ifconf, not a struct ifreq. */
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		goto out_noref;
 	}
 
 	/* Requests served without looking up an interface first. */
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		goto out_noref;
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
 			    ifr_data_get_ptr(ifr) : NULL);
 		goto out_noref;
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 
 		if (error == 0) {
 			sx_xlock(&ifnet_detach_sxlock);
 			error = if_clone_destroy(ifr->ifr_name);
 			sx_xunlock(&ifnet_detach_sxlock);
 		}
 		goto out_noref;
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		goto out_noref;
 
 	case SIOCGIFGMEMB:
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		goto out_noref;
 
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		goto out_noref;
 #endif
 	}
 
 	/* Everything else operates on a named interface; hold a reference. */
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		error = ENXIO;
 		goto out_noref;
 	}
 
 	/* ENOIOCTL means ifhwioctl() did not recognise the command. */
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL)
 		goto out_ref;
 
 	/* Saved to detect an IFF_UP change made by the calls below. */
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		error = EOPNOTSUPP;
 		goto out_ref;
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
 	error = so->so_proto->pr_control(so, cmd, data, ifp, td);
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	/* The interface was brought up as a side effect: tell IPv6. */
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 
 out_ref:
 	if_rele(ifp);
 out_noref:
 	CURVNET_RESTORE();
 #ifdef COMPAT_FREEBSD32
 	/* Results are copied back to the 32-bit structure only on success. */
 	if (error != 0)
 		return (error);
 	switch (saved_cmd) {
 	case SIOCGIFCONF32:
 		ifc32->ifc_len = thunk.ifc.ifc_len;
 		break;
 	case SIOCGDRVSPEC32:
 		/*
 		 * SIOCGDRVSPEC is IOWR, but nothing actually touches
 		 * the struct so just assert that ifd_len (the only
 		 * field it might make sense to update) hasn't
 		 * changed.
 		 */
 		KASSERT(thunk.ifd.ifd_len == ifd32->ifd_len,
 		    ("ifd_len was updated %u -> %zu", ifd32->ifd_len,
 			thunk.ifd.ifd_len));
 		break;
 	case SIOCGIFGROUP32:
 	case SIOCGIFGMEMB32:
 		ifgr32->ifgr_len = thunk.ifgr.ifgr_len;
 		break;
 	case SIOCGIFMEDIA32:
 	case SIOCGIFXMEDIA32:
 		ifmr32->ifm_current = thunk.ifmr.ifm_current;
 		ifmr32->ifm_mask = thunk.ifmr.ifm_mask;
 		ifmr32->ifm_status = thunk.ifmr.ifm_status;
 		ifmr32->ifm_active = thunk.ifmr.ifm_active;
 		ifmr32->ifm_count = thunk.ifmr.ifm_count;
 		break;
 	}
 #endif
 	return (error);
 }
 
 /*
  * Shared implementation of reference counted interface flags, as used
  * by ifpromisc() and if_allmulti().
  *
  * "flag" is the flag to manage and "*refcount" its user count;
  * "onswitch" selects between taking (non-zero) and dropping (zero) a
  * reference.  "pflag" may name a permanent mode flag, such as
  * IFF_PPROMISC for promiscuous mode, and should be 0 if there is none:
  * while it is set on the interface only the count is adjusted.
  *
  * The flag itself is toggled, and the driver told via SIOCSIFFLAGS,
  * only for the first reference taken or the last one dropped.  Should
  * the driver call fail, both the count and if_flags are put back and
  * the error is returned.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int rc;
 	int saved_flags, saved_count;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch) {
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	} else {
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	}
 
 	/* Permanent mode in effect: bookkeeping only. */
 	if (ifp->if_flags & pflag) {
 		if (onswitch)
 			(*refcount)++;
 		else
 			(*refcount)--;
 		return (0);
 	}
 
 	/* Keep the old state around in case the driver call fails. */
 	saved_count = *refcount;
 	saved_flags = ifp->if_flags;
 
 	/*
 	 * Adjust the count.  Unless this is the first reference taken or
 	 * the last one dropped, there is nothing more to do.
 	 */
 	if (onswitch) {
 		*refcount = saved_count + 1;
 		if (saved_count != 0)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		*refcount = saved_count - 1;
 		if (*refcount != 0)
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* The flags changed, so the driver has to be told. */
 	if (ifp->if_ioctl == NULL) {
 		rc = EOPNOTSUPP;
 	} else {
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		rc = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	}
 
 	if (rc == 0) {
 		/* Notify userland that interface flags have changed */
 		rt_ifmsg(ifp, flag);
 		return (0);
 	}
 
 	/* Driver error: restore the previous state. */
 	*refcount = saved_count;
 	ifp->if_flags = saved_flags;
 	return (rc);
 }
 
 /*
  * Set/clear promiscuous mode on interface ifp based on the truth value
  * of pswitch.  The calls are reference counted so that only the first
  * "on" request actually has an effect, as does the final "off" request.
  * Results are undefined if the "off" and "on" requests are not matched.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int error;
 	int oldflags = ifp->if_flags;
 
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 			   &ifp->if_pcount, pswitch);
 	/* If promiscuous mode status has changed, log a message */
 	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
             log_promisc_mode_change)
 		if_printf(ifp, "promiscuous mode %s\n",
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	return (error);
 }
 
 /*
  * Return the interface configuration of the system (SIOCGIFCONF).
  * The list may be used in later ioctls (above) to get other
  * information.
  *
  * "data" is a struct ifconf.  One struct ifreq is produced per
  * interface address visible to the caller's prison, or a single one
  * with a zeroed address for an interface with no visible address.
  * Addresses longer than a struct sockaddr are appended in full after
  * the name part of the ifreq.  On return ifc_len holds the number of
  * bytes copied out.  "cmd" is unused.
  *
  * Returns 0, EINVAL for a non-positive ifc_len, ENAMETOOLONG if an
  * interface name does not fit into ifr_name, or the copyout() error.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	/*
 	 * max_len is first the size of the buffer to allocate and then,
 	 * once the sbuf exists, the number of bytes we tried to append.
 	 * valid_len is the number of bytes that actually fit.  "full" is
 	 * set once the allocation covers the whole user buffer.
 	 */
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to maxphys to avoid DoS from userspace. */
 	max_len = maxphys - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		struct epoch_tracker et;
 		int addrs;
 
 		/*
 		 * Zero the ifr to make sure we don't disclose the contents
 		 * of the stack.
 		 */
 		memset(&ifr, 0, sizeof(ifr));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			/* Skip addresses the caller's prison may not see. */
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				/* Short addresses are zero padded. */
 				if (sa->sa_len < sizeof(*sa)) {
 					memset(&ifr.ifr_ifru.ifru_addr, 0,
 					    sizeof(ifr.ifr_ifru.ifru_addr));
 					memcpy(&ifr.ifr_ifru.ifru_addr, sa,
 					    sa->sa_len);
 				} else
 					ifr.ifr_ifru.ifru_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				/*
 				 * Oversized address: emit the part of the
 				 * ifreq up to ifr_addr, then the whole
 				 * sockaddr.
 				 */
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			/* Only count what the sbuf accepted without error. */
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		NET_EPOCH_EXIT(et);
 		/* No visible address: still report the interface itself. */
 		if (addrs == 0) {
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 	int rc;
 
 	/* Delegate to if_setflag() with IFF_ALLMULTI and the if_amcount counter. */
 	rc = if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch);
 	return (rc);
 }
 
 /*
  * Search ifp's multicast address list for an entry equal to 'sa'.
  * AF_LINK addresses are compared with sa_dl_equal(), everything else
  * with sa_equal().  Returns the matching entry, or NULL when the list
  * is exhausted.  The interface address lock must be held (asserted).
  */
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
 {
 	struct ifmultiaddr *cur;
 	int match;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	CK_STAILQ_FOREACH(cur, &ifp->if_multiaddrs, ifma_link) {
 		match = (sa->sa_family == AF_LINK) ?
 		    sa_dl_equal(cur->ifma_addr, sa) :
 		    sa_equal(cur->ifma_addr, sa);
 		if (match)
 			break;
 	}
 
 	return (cur);
 }
 
 /*
  * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
  * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
  * the ifnet multicast address list here, so the caller must do that and
  * other setup work (such as notifying the device driver).  The reference
  * count is initialized to 1.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *addr_copy;
 	struct sockaddr *lladdr_copy = NULL;
 
 	ifma = malloc(sizeof(*ifma), M_IFMADDR, mflags | M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	/* Private copy of the protocol address. */
 	addr_copy = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (addr_copy == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, addr_copy, sa->sa_len);
 	ifma->ifma_addr = addr_copy;
 
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 
 	/* Private copy of the link-layer address, when one was supplied. */
 	if (llsa != NULL) {
 		lladdr_copy = malloc(llsa->sa_len, M_IFMADDR, mflags);
 		if (lladdr_copy == NULL) {
 			free(ifma->ifma_addr, M_IFMADDR);
 			free(ifma, M_IFMADDR);
 			return (NULL);
 		}
 		bcopy(llsa, lladdr_copy, llsa->sa_len);
 	}
 	ifma->ifma_lladdr = lladdr_copy;
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: free ifmultiaddr structure and possibly attached related
  * addresses.  The caller is responsible for implementing reference
  * counting, notifying the driver, handling routing messages, and releasing
  * any dependent link layer state.
  */
 #ifdef MCAST_VERBOSE
 extern void kdb_backtrace(void);
 #endif
 static void
 if_freemulti_internal(struct ifmultiaddr *ifma)
 {
 	KASSERT(ifma->ifma_refcount == 0,
 	    ("if_freemulti: refcount %d", ifma->ifma_refcount));
 
 	/* The link-layer address copy is only present for some entries. */
 	if (ifma->ifma_lladdr != NULL) {
 		free(ifma->ifma_lladdr, M_IFMADDR);
 	}
 #ifdef MCAST_VERBOSE
 	kdb_backtrace();
 	printf("%s freeing ifma: %p\n", __func__, ifma);
 #endif
 	/* Release the address copy, then the entry itself. */
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 /*
  * Epoch callback: recover the ifmultiaddr that embeds 'ctx' as its
  * ifma_epoch_ctx member and free it.
  */
 static void
 if_destroymulti(epoch_context_t ctx)
 {
 
 	if_freemulti_internal(__containerof(ctx, struct ifmultiaddr,
 	    ifma_epoch_ctx));
 }
 
 /*
  * Schedule 'ifma' for destruction via NET_EPOCH_CALL(); the actual free
  * is done by if_destroymulti().  The reference count must already be 0.
  */
 void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 
 	KASSERT(ifma->ifma_refcount == 0,
 	    ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount));
 	NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx);
 }
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	/* On-stack buffer offered to if_resolvemulti() for its result. */
 	struct sockaddr_dl sdl;
 	int error;
 
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 #ifdef INET6
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions return address data which
 	 * fits inside the default sockaddr_dl structure.  However, the
 	 * callback can allocate another sockaddr structure; in that case
 	 * we need to free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate it as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				/*
 				 * Drop the initial reference first:
 				 * if_freemulti() asserts a zero refcount.
 				 */
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			ll_ifma->ifma_flags |= IFMA_F_ENQUEUED;
 			CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	ifma->ifma_flags |= IFMA_F_ENQUEUED;
 	CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		/*
 		 * Call the driver directly when this thread may sleep;
 		 * otherwise defer the ioctl to the if_addmultitask task.
 		 */
 		if (THREAD_CAN_SLEEP())
 			(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 		else
 			taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask);
 	}
 
 	/* Free llsa only if the resolver replaced our on-stack buffer. */
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 	/* Error exits: free_llsa_out falls through into unlock_out. */
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Task handler: issue SIOCADDMULTI to the interface passed as 'arg',
  * with the current vnet set to the interface's vnet for the call.
  */
 static void
 if_siocaddmulti(void *arg, int pending)
 {
 	struct ifnet *ifp = arg;
 
 #ifdef DIAGNOSTIC
 	if (pending > 1) {
 		if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending);
 	}
 #endif
 	CURVNET_SET(ifp->if_vnet);
 	(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Delete a multicast group membership by network-layer group address.
  *
  * Returns ENOENT if the entry could not be found. If ifp no longer
  * exists, results are undefined. This entry point should only be used
  * from subsystems which do appropriate locking to hold ifp for the
  * duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *entry;
 	int lastref;
 
 	KASSERT(ifp, ("%s: NULL ifp", __func__));
 
 	IF_ADDR_WLOCK(ifp);
 	entry = if_findmulti(ifp, sa);
 	if (entry == NULL) {
 		/* Nothing registered under this address. */
 		IF_ADDR_WUNLOCK(ifp);
 		return (ENOENT);
 	}
 	lastref = if_delmulti_locked(ifp, entry, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Tell the driver only when the final reference went away. */
 	if (lastref && ifp->if_ioctl != NULL)
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /* Convenience wrapper: if_delmulti_ifma_flags() with a flags value of 0. */
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 
 	if_delmulti_ifma_flags(ifma, 0);
 }
 
 /*
  * Delete a multicast group membership by group membership pointer.
  * Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared.
  */
 void
 if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags)
 {
 	struct ifnet *ifp;
 	int lastref;
 	MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma);
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	/* The owning interface is taken from the entry itself; may be NULL. */
 	ifp = ifma->ifma_ifp;
 #ifdef DIAGNOSTIC
 	if (ifp == NULL) {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	} else {
 		struct epoch_tracker et;
 		struct ifnet *oifp;
 
 		/*
 		 * Check that ifp is still on the V_ifnet list; if it is
 		 * not found, treat the interface as gone.
 		 */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
 			if (ifp == oifp)
 				break;
 		NET_EPOCH_EXIT(et);
 		if (ifp != oifp)
 			ifp = NULL;
 	}
 #endif
 	/*
 	 * If and only if the ifnet instance exists: Acquire the address lock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_WLOCK(ifp);
 
 	/* 'flags' is handed on as if_delmulti_locked()'s 'detaching' argument. */
 	lastref = if_delmulti_locked(ifp, ifma, flags);
 
 	if (ifp != NULL) {
 		/*
 		 * If and only if the ifnet instance exists:
 		 *  Release the address lock.
 		 *  If the group was left: update the hardware hash filter.
 		 */
 		IF_ADDR_WUNLOCK(ifp);
 		if (lastref && ifp->if_ioctl != NULL) {
 			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 		}
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	/*
 	 * The caller's ifp is used only for these consistency checks; the
 	 * interface actually operated on is re-read from the entry below.
 	 */
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	ifp = ifma->ifma_ifp;
 	MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	/* Other references remain: nothing is unlinked or freed. */
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	/* Unlink from the interface list, except while detaching. */
 	if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		/* Drop the reference held on the link-layer entry. */
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ifp != NULL) {
 				if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 					CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
 						ifma_link);
 					ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 				}
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 #ifdef INVARIANTS
 	/* Sanity check: the entry must no longer be on the interface list. */
 	if (ifp) {
 		struct ifmultiaddr *ifmatmp;
 
 		CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
 			MPASS(ifma != ifmatmp);
 	}
 #endif
 	if_freemulti(ifma);
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * At this time we only support certain types of interfaces,
  * and we don't allow the length of the address to change.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 
 	ifa = ifp->if_addr;
 	if (ifa == NULL)
 		return (EINVAL);
 
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL)
 		return (EINVAL);
 
 	/* The address length is not allowed to change. */
 	if (len != sdl->sdl_alen)
 		return (EINVAL);
 
 	/* Only these interface types are supported. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_XETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_IEEE8023ADLAG:
 		break;
 	default:
 		return (ENODEV);
 	}
 	bcopy(lladdr, LLADDR(sdl), len);
 
 	/*
 	 * An interface that is already up is bounced (IFF_UP cleared and
 	 * then set again, with SIOCSIFFLAGS each time) so that it
 	 * reprograms its address filter.
 	 */
 	if ((ifp->if_flags & IFF_UP) != 0 && ifp->if_ioctl) {
 		ifp->if_flags &= ~IFF_UP;
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 
 		ifp->if_flags |= IFF_UP;
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	}
 	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 
 	return (0);
 }
 
 /*
  * Compat function for handling basic encapsulation requests.
  * Stacks that have not been converted (FDDI, IB, ...) use the traditional
  * output model: ARP (and other similar L2 protocols) is handled inside
  * the output routine, and arpresolve()/nd6_resolve() return the MAC
  * address instead of the full prepend.
  *
  * This function creates calculated header==MAC for IPv4/IPv6 and
  * returns EAFNOSUPPORT (which is then handled in ARP code) for other
  * address families.
  */
 static int
 if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
 {
 
 	/* Only link-layer encapsulation requests are handled. */
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	/* The output buffer must be able to hold the link-layer address. */
 	if (req->bufsize < req->lladdr_len)
 		return (ENOMEM);
 
 	if (req->family != AF_INET && req->family != AF_INET6)
 		return (EAFNOSUPPORT);
 
 	/* The "header" is the link-layer address, copied verbatim. */
 	memmove(req->buf, req->lladdr, req->lladdr_len);
 	req->bufsize = req->lladdr_len;
 	req->lladdr_off = 0;
 
 	return (0);
 }
 
 /*
  * Tunnel interfaces can nest, also they may cause infinite recursion
  * calls when misconfigured. We'll prevent this by detecting loops.
  * High nesting level may cause stack exhaustion. We'll prevent this
  * by introducing upper limit.
  *
  * Return 0, if tunnel nesting count is equal or less than limit.
  */
 int
 if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie,
     int limit)
 {
 	struct m_tag *tag;
 	int depth = 1;
 
 	/* Walk every tag with this cookie; each one records an ifnet. */
 	for (tag = m_tag_locate(m, cookie, 0, NULL); tag != NULL;
 	    tag = m_tag_locate(m, cookie, 0, tag)) {
 		if (*(struct ifnet **)(tag + 1) == ifp) {
 			log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
 			return (EIO);
 		}
 		depth++;
 	}
 	if (depth > limit) {
 		log(LOG_NOTICE,
 		    "%s: if_output recursively called too many times(%d)\n",
 		    if_name(ifp), depth);
 		return (EIO);
 	}
 
 	/* Record this interface on the mbuf for later calls to find. */
 	tag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT);
 	if (tag == NULL)
 		return (ENOMEM);
 	*(struct ifnet **)(tag + 1) = ifp;
 	m_tag_prepend(m, tag);
 	return (0);
 }
 
 /*
  * Get the link layer address that was read from the hardware at attach.
  *
  * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type
  * their component interfaces as IFT_IEEE8023ADLAG.
  */
 int
 if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr)
 {
 
 	if (ifp->if_hw_addr == NULL)
 		return (ENODEV);
 
 	/* Only these two interface types are served. */
 	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_IEEE8023ADLAG)
 		return (ENODEV);
 
 	bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen);
 	return (0);
 }
 
 /*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 
 	/* Without a unit number the external name is the driver name. */
 	if (unit == IF_DUNIT_NONE) {
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 		return;
 	}
 	snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 }
 
 /*
  * Log a message at priority 'pri' with the interface name and ": "
  * prepended to the caller's format string.  Always returns 0.
  */
 static int
 if_vlog(struct ifnet *ifp, int pri, const char *fmt, va_list ap)
 {
 	char prefixed[256];
 
 	snprintf(prefixed, sizeof(prefixed), "%s: %s", ifp->if_xname, fmt);
 	vlog(pri, prefixed, ap);
 	return (0);
 }
 
 
 /* printf-style logging at LOG_INFO via if_vlog(); always returns 0. */
 int
 if_printf(struct ifnet *ifp, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
 	(void)if_vlog(ifp, LOG_INFO, fmt, args);
 	va_end(args);
 	return (0);
 }
 
 /* printf-style logging at caller-chosen priority 'pri'; always returns 0. */
 int
 if_log(struct ifnet *ifp, int pri, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
 	(void)if_vlog(ifp, pri, fmt, args);
 	va_end(args);
 	return (0);
 }
 
 /* Invoke the interface's if_start method on itself. */
 void
 if_start(struct ifnet *ifp)
 {
 
 	ifp->if_start(ifp);
 }
 
 /*
  * Backwards-compatible default transmit routine for drivers
  * that have not implemented their own if_transmit method.
  */
 static int
 if_transmit_default(struct ifnet *ifp, struct mbuf *m)
 {
 	int rc;
 
 	/* IFQ_HANDOFF() stores its result in rc. */
 	IFQ_HANDOFF(ifp, m, rc);
 	return (rc);
 }
 
 /* Input handler that simply frees the mbuf chain; 'ifp' is unused. */
 static void
 if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
 {
 	m_freem(m);
 }
 
 /*
  * Enqueue mbuf 'm' on 'ifq'.  'ifp' is optional: when non-NULL its
  * output byte/multicast counters are updated ('adjust' is added to the
  * packet length for the byte count) and its if_start method is called
  * unless IFF_DRV_OACTIVE is set.
  *
  * Returns 1 when the mbuf was queued, or 0 when the queue was full, in
  * which case the mbuf has been freed.
  */
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int active = 0;
 
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		IF_UNLOCK(ifq);
 		/*
 		 * ifp may be NULL (see the checks below), so only account
 		 * the drop when there is an interface to charge it to.
 		 */
 		if (ifp != NULL)
 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
 		if (m->m_flags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		/* Sample the flag before queueing; it is used after unlocking. */
 		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	if (ifp != NULL && !active)
 		(*(ifp)->if_start)(ifp);
 	return (1);
 }
 
 /*
  * Record the allocate ('a') and free ('f') callbacks for interface
  * type 'type'.  Both slots must currently be empty (asserted).
  */
 void
 if_register_com_alloc(u_char type,
     if_com_alloc_t *a, if_com_free_t *f)
 {
 
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_free[type] = f;
 	if_com_alloc[type] = a;
 }
 
 /*
  * Clear the allocate/free callbacks registered for interface type
  * 'type'.  Both slots must currently be set (asserted).
  */
 void
 if_deregister_com_alloc(u_char type)
 {
 
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 
 	/*
 	 * Let every pending EPOCH(9) callback run before the slots are
 	 * cleared: a late if_destroy() would otherwise leak the if_l2com
 	 * that if_com_alloc[type] allocated.
 	 */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	if_com_free[type] = NULL;
 	if_com_alloc[type] = NULL;
 }
 
 /* API for driver access to the ifnet owned by the network stack. */
 /* Store a new if_baudrate value and return the one it replaced. */
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	const uint64_t previous = ifp->if_baudrate;
 
 	ifp->if_baudrate = baudrate;
 	return (previous);
 }
 
 /* Return the interface's if_baudrate field. */
 uint64_t
 if_getbaudrate(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_baudrate);
 }
 
 /* Overwrite the if_capabilities field; always returns 0. */
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capabilities = capabilities;
 	return (0);
 }
 
 /*
  * Clear 'clearbit' and then set 'setbit' in if_capabilities (bits named
  * in both end up set).  Always returns 0.
  */
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capabilities &= ~clearbit;
 	ifn->if_capabilities |= setbit;
 	return (0);
 }
 
 /* Return the interface's if_capabilities field. */
 int
 if_getcapabilities(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_capabilities);
 }
 
 /* Overwrite the if_capenable field; always returns 0. */
 int
 if_setcapenable(if_t ifp, int capabilities)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capenable = capabilities;
 	return (0);
 }
 
 /*
  * Clear 'clearcap' and then set 'setcap' in if_capenable; a zero mask
  * skips the corresponding update.  Always returns 0.
  */
 int
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	if (clearcap != 0)
 		ifn->if_capenable &= ~clearcap;
 	if (setcap != 0)
 		ifn->if_capenable |= setcap;
 	return (0);
 }
 
 /* Return the interface's if_dname pointer. */
 const char *
 if_getdname(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_dname);
 }
 
 /* Point if_dname at 'dname'; the string is not copied. */
 void
 if_setdname(if_t ifp, const char *dname)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_dname = dname;
 }
 
 /* Return a pointer to the interface's if_xname buffer. */
 const char *
 if_name(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_xname);
 }
 
 /*
  * Copy 'name' into if_xname.  Returns ENAMETOOLONG, leaving if_xname
  * untouched, when the name plus its terminator does not fit; 0 otherwise.
  */
 int
 if_setname(if_t ifp, const char *name)
 {
 	const size_t cap = sizeof(ifp->if_xname);
 
 	if (strlen(name) >= cap)
 		return (ENAMETOOLONG);
 	strlcpy(ifp->if_xname, name, cap);
 	return (0);
 }
 
 /* XOR 'togglecap' into if_capenable; always returns 0. */
 int
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capenable ^= togglecap;
 	return (0);
 }
 
 /* Return the interface's if_capenable field. */
 int
 if_getcapenable(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_capenable);
 }
 
 /* Return the interface's if_dunit field. */
 int
 if_getdunit(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_dunit);
 }
 
 /* Return the interface's if_index field. */
 int
 if_getindex(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_index);
 }
 
 /*
  * Install 'descrbuf' as the interface description under ifdescr_sx and
  * release the previous buffer after the lock has been dropped.
  */
 void
 if_setdescr(if_t ifp, char *descrbuf)
 {
 	char *previous;
 
 	sx_xlock(&ifdescr_sx);
 	previous = ifp->if_description;
 	ifp->if_description = descrbuf;
 	sx_xunlock(&ifdescr_sx);
 
 	if_freedescr(previous);
 }
 
 /*
  * Allocate a zeroed description buffer of 'sz' bytes from M_IFDESCR.
  * Only the M_WAITOK/M_NOWAIT bits of 'malloc_flag' are honoured.
  */
 char *
 if_allocdescr(size_t sz, int malloc_flag)
 {
 	const int flags = M_ZERO | (malloc_flag & (M_WAITOK | M_NOWAIT));
 
 	return (malloc(sz, M_IFDESCR, flags));
 }
 
 /* Release a description buffer back to the M_IFDESCR malloc type. */
 void
 if_freedescr(char *descrbuf)
 {
 
 	free(descrbuf, M_IFDESCR);
 }
 
 /* Return the interface's if_alloctype field. */
 int
 if_getalloctype(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_alloctype);
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexibility for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 
 	/* Currently a no-op: neither argument is used. */
 	return (0);
 }
 
 /*
  * Clear 'clear_flags' and then set 'set_flags' in if_drv_flags (bits
  * named in both end up set).  Always returns 0.
  */
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_drv_flags &= ~clear_flags;
 	ifn->if_drv_flags |= set_flags;
 	return (0);
 }
 
 /* Return the interface's if_drv_flags field. */
 int
 if_getdrvflags(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_drv_flags);
 }
 
 /* Overwrite the if_drv_flags field; always returns 0. */
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_drv_flags = flags;
 	return (0);
 }
 
 /* Overwrite the if_flags field; always returns 0. */
 int
 if_setflags(if_t ifp, int flags)
 {
 	ifp->if_flags = flags;
 
 	return (0);
 }
 
 /*
  * Clear 'clear' and then set 'set' in if_flags (bits named in both end
  * up set).  Always returns 0.
  */
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_flags &= ~clear;
 	ifn->if_flags |= set;
 	return (0);
 }
 
 /* Return the interface's if_flags field. */
 int
 if_getflags(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_flags);
 }
 
 /* Reset if_hwassist to 0; always returns 0. */
 int
 if_clearhwassist(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist = 0;
 	return (0);
 }
 
 /*
  * Clear 'toclear' and then set 'toset' in if_hwassist (bits named in
  * both end up set).  Always returns 0.
  */
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist &= ~toclear;
 	ifn->if_hwassist |= toset;
 	return (0);
 }
 
 /* Overwrite the if_hwassist field; always returns 0. */
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 /* Return the interface's if_hwassist field. */
 int
 if_gethwassist(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_hwassist);
 }
 
 /* XOR 'toggle_bits' into if_hwassist; always returns 0. */
 int
 if_togglehwassist(if_t ifp, int toggle_bits)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist ^= toggle_bits;
 	return (0);
 }
 
 /* Overwrite the if_mtu field; always returns 0. */
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_mtu = mtu;
 	return (0);
 }
 
 /* Return the interface's if_mtu field. */
 int
 if_getmtu(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_mtu);
 }
 
 /*
  * Return the MTU for address family 'family': the result of the first
  * matching domain's dom_ifmtu hook when one is registered, otherwise
  * the interface's if_mtu.
  */
 int
 if_getmtu_family(const if_t ifp, int family)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 	struct domain *dom;
 
 	SLIST_FOREACH(dom, &domains, dom_next) {
 		if (dom->dom_family != family || dom->dom_ifmtu == NULL)
 			continue;
 		return (dom->dom_ifmtu(ifn));
 	}
 
 	return (ifn->if_mtu);
 }
 
 /*
  * Methods for drivers to access interface unicast and multicast
  * link level addresses.  Drivers shall know neither 'struct ifaddr' nor
  * 'struct ifmultiaddr'.
  */
 /* Count the AF_LINK entries on the interface's if_addrhead list. */
 u_int
 if_lladdr_count(if_t ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *cur;
 	u_int total = 0;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
 		if (cur->ifa_addr->sa_family != AF_LINK)
 			continue;
 		total++;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (total);
 }
 
 /*
  * Call 'cb' for every AF_LINK entry on if_addrhead, passing cb_arg, the
  * entry's sockaddr_dl and the running total.  Each callback return value
  * is added to that total, which is returned at the end.
  */
 u_int
 if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
 {
 	struct epoch_tracker et;
 	struct ifaddr *cur;
 	u_int total = 0;
 
 	MPASS(cb);
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
 		if (cur->ifa_addr->sa_family == AF_LINK) {
 			total += (*cb)(cb_arg,
 			    (struct sockaddr_dl *)cur->ifa_addr, total);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (total);
 }
 
 /*
  * Count the AF_LINK entries on the interface's if_multiaddrs list.
  * The list is walked inside a network epoch section.
  */
 u_int
 if_llmaddr_count(if_t ifp)
 {
 	struct epoch_tracker et;
 	struct ifmultiaddr *ifma;
 	/* Unsigned to match the return type and if_lladdr_count(). */
 	u_int count;
 
 	count = 0;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
 		if (ifma->ifma_addr->sa_family == AF_LINK)
 			count++;
 	NET_EPOCH_EXIT(et);
 
 	return (count);
 }
 
 /*
  * Call 'cb' for every AF_LINK entry on if_multiaddrs, passing cb_arg,
  * the entry's sockaddr_dl and the running total.  Each callback return
  * value is added to that total, which is returned at the end.
  */
 u_int
 if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
 {
 	struct epoch_tracker et;
 	struct ifmultiaddr *cur;
 	u_int total = 0;
 
 	MPASS(cb);
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(cur, &ifp->if_multiaddrs, ifma_link) {
 		if (cur->ifma_addr->sa_family == AF_LINK) {
 			total += (*cb)(cb_arg,
 			    (struct sockaddr_dl *)cur->ifma_addr, total);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (total);
 }
 
 /*
  * Call 'cb' for every if_addrhead entry whose address family equals
  * 'type', passing cb_arg, the ifaddr and the running total.  Each
  * callback return value is added to that total, which is returned.
  */
 u_int
 if_foreach_addr_type(if_t ifp, int type, if_addr_cb_t cb, void *cb_arg)
 {
 	struct epoch_tracker et;
 	struct ifaddr *cur;
 	u_int total = 0;
 
 	MPASS(cb);
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
 		if (cur->ifa_addr->sa_family == type)
 			total += (*cb)(cb_arg, cur, total);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (total);
 }
 
 /* Store 'softc' in the if_softc field; always returns 0. */
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_softc = softc;
 	return (0);
 }
 
 /* Return the interface's if_softc pointer. */
 void *
 if_getsoftc(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_softc);
 }
 
 /*
  * Set the mbuf's packet-header rcvif to 'ifp'.  Asserts that
  * CSUM_SND_TAG is not set in the packet header's csum_flags.
  */
 void
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = ifn;
 }
 
 /* Store 'tag' in the mbuf packet header's ether_vtag field. */
 void
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 
 	m->m_pkthdr.ether_vtag = tag;
 }
 
 /* Return the mbuf packet header's ether_vtag field. */
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 	const uint16_t tag = m->m_pkthdr.ether_vtag;
 
 	return (tag);
 }
 
 /* Return the result of IFQ_DRV_IS_EMPTY() on the interface's if_snd. */
 int
 if_sendq_empty(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (IFQ_DRV_IS_EMPTY(&ifn->if_snd));
 }
 
 /* Return the interface's if_addr pointer. */
 struct ifaddr *
 if_getifaddr(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_addr);
 }
 
 /* Return the interface's if_amcount field. */
 int
 if_getamcount(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_amcount);
 }
 
 /* Apply IFQ_SET_READY() to the interface's if_snd; always returns 0. */
 int
 if_setsendqready(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_SET_READY(&ifn->if_snd);
 	return (0);
 }
 
 /*
  * Set both the send queue maximum length (IFQ_SET_MAXLEN) and its
  * ifq_drv_maxlen to 'tx_desc_count'.  Always returns 0.
  */
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_SET_MAXLEN(&ifn->if_snd, tx_desc_count);
 	ifn->if_snd.ifq_drv_maxlen = tx_desc_count;
 	return (0);
 }
 
 /* Return 1 when if_vlantrunk is non-NULL, 0 otherwise. */
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	if (ifn->if_vlantrunk != NULL)
 		return (1);
 	return (0);
 }
 
 /* Call the interface's if_init method with 'ctx'; always returns 0. */
 int
 if_init(if_t ifp, void *ctx)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	(*ifn->if_init)(ctx);
 	return (0);
 }
 
 /* Hand 'sendmp' to the interface's if_input method; always returns 0. */
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	(*ifn->if_input)(ifn, sendmp);
 	return (0);
 }
 
 /*
  * Hand 'm' to the interface's if_transmit method and return that
  * method's result, so that callers can see transmit errors instead of
  * an unconditional 0.
  */
 int
 if_transmit(if_t ifp, struct mbuf *m)
 {
 	return ((*((struct ifnet *)ifp)->if_transmit)((struct ifnet *)ifp, m));
 }
 
 /* Return the mbuf produced by IFQ_DRV_DEQUEUE() on the if_snd queue. */
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 	struct mbuf *head;
 
 	IFQ_DRV_DEQUEUE(&ifn->if_snd, head);
 	return (head);
 }
 
 /* Apply IFQ_DRV_PREPEND() to put 'm' on if_snd; always returns 0. */
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_DRV_PREPEND(&ifn->if_snd, m);
 	return (0);
 }
 
 /* Store 'len' in the if_hdrlen field; always returns 0. */
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hdrlen = len;
 	return (0);
 }
 
 /* Return the result of IF_LLADDR() for the interface. */
 caddr_t
 if_getlladdr(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (IF_LLADDR(ifn));
 }
 
 /* Return the result of if_alloc(type) as an opaque handle. */
 void *
 if_gethandle(u_char type)
 {
 	void *handle = if_alloc(type);
 
 	return (handle);
 }
 
 /* Pass 'm' to BPF_MTAP() for this interface. */
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp;
 
 	ifp = (struct ifnet *)ifh;
 	BPF_MTAP(ifp, m);
 }
 
 /* Pass 'm' to ETHER_BPF_MTAP() for this interface. */
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp;
 
 	ifp = (struct ifnet *)ifh;
 	ETHER_BPF_MTAP(ifp, m);
 }
 
 /* Invoke VLAN_CAPABILITIES() for this interface. */
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifp;
 
 	ifp = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifp);
 }
 
 /* Store the if_hw_tsomax value; always returns 0. */
 int
 if_sethwtsomax(if_t ifp, u_int if_hw_tsomax)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hw_tsomax = if_hw_tsomax;
 	return (0);
 }
 
 /* Store the if_hw_tsomaxsegcount value; always returns 0. */
 int
 if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount;
 	return (0);
 }
 
 /* Store the if_hw_tsomaxsegsize value; always returns 0. */
 int
 if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
 	return (0);
 }
 
 /* Return the interface's if_hw_tsomax field. */
 u_int
 if_gethwtsomax(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_hw_tsomax);
 }
 
 /* Return the interface's if_hw_tsomaxsegcount field. */
 u_int
 if_gethwtsomaxsegcount(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_hw_tsomaxsegcount);
 }
 
 /* Return the interface's if_hw_tsomaxsegsize field. */
 u_int
 if_gethwtsomaxsegsize(const if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_hw_tsomaxsegsize);
 }
 
 /* Install 'init_fn' as the interface's if_init method. */
 void
 if_setinitfn(if_t ifp, if_init_fn_t init_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_init = init_fn;
 }
 
 /* Install 'input_fn' as the interface's if_input method. */
 void
 if_setinputfn(if_t ifp, if_input_fn_t input_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_input = input_fn;
 }
 
 /* Install 'ioctl_fn' (cast through void *) as the if_ioctl method. */
 void
 if_setioctlfn(if_t ifp, if_ioctl_fn_t ioctl_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ioctl = (void *)ioctl_fn;
 }
 
 /* Install 'output_fn' as the interface's if_output method. */
 void
 if_setoutputfn(if_t ifp, if_output_fn_t output_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_output = output_fn;
 }
 
 /* Install 'start_fn' (cast through void *) as the if_start method. */
 void
 if_setstartfn(if_t ifp, if_start_fn_t start_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_start = (void *)start_fn;
 }
 
 /* Install 'start_fn' as the interface's if_transmit method. */
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_transmit = start_fn;
 }
 
 /* Install 'flush_fn' as the interface's if_qflush method. */
 void
 if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_qflush = flush_fn;
 }
 
 /* Install 'alloc_fn' as the interface's if_snd_tag_alloc method. */
 void
 if_setsndtagallocfn(if_t ifp, if_snd_tag_alloc_t alloc_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_snd_tag_alloc = alloc_fn;
 }
 
 /* Install 'fn' as the interface's if_get_counter method. */
 void
 if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
 {
 	ifp->if_get_counter = fn;
 }
 
 #ifdef DDB
 /*
  * DDB helper: print the interface name followed by one "   field = value"
  * line for each struct ifnet member listed below.  A NULL ifp prints
  * nothing.
  */
 static void
 if_show_ifnet(struct ifnet *ifp)
 {
 
 	if (ifp == NULL)
 		return;
 	db_printf("%s:\n", ifp->if_xname);
 	/*
 	 * IF_DB_PRINTF(f, e) prints member 'e' of ifp with conversion 'f';
 	 * '#e' stringifies the member name for the label.  The macro is
 	 * local to this function and undefined again at the end.
 	 */
 #define	IF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, ifp->e);
 	IF_DB_PRINTF("%s", if_dname);
 	IF_DB_PRINTF("%d", if_dunit);
 	IF_DB_PRINTF("%s", if_description);
 	IF_DB_PRINTF("%u", if_index);
 	IF_DB_PRINTF("%d", if_idxgen);
 	IF_DB_PRINTF("%u", if_refcount);
 	IF_DB_PRINTF("%p", if_softc);
 	IF_DB_PRINTF("%p", if_l2com);
 	IF_DB_PRINTF("%p", if_llsoftc);
 	IF_DB_PRINTF("%d", if_amcount);
 	IF_DB_PRINTF("%p", if_addr);
 	IF_DB_PRINTF("%p", if_broadcastaddr);
 	IF_DB_PRINTF("%p", if_afdata);
 	IF_DB_PRINTF("%d", if_afdata_initialized);
 	IF_DB_PRINTF("%u", if_fib);
 	IF_DB_PRINTF("%p", if_vnet);
 	IF_DB_PRINTF("%p", if_home_vnet);
 	IF_DB_PRINTF("%p", if_vlantrunk);
 	IF_DB_PRINTF("%p", if_bpf);
 	IF_DB_PRINTF("%u", if_pcount);
 	IF_DB_PRINTF("%p", if_bridge);
 	IF_DB_PRINTF("%p", if_lagg);
 	IF_DB_PRINTF("%p", if_pf_kif);
 	IF_DB_PRINTF("%p", if_carp);
 	IF_DB_PRINTF("%p", if_label);
 	IF_DB_PRINTF("%p", if_netmap);
 	IF_DB_PRINTF("0x%08x", if_flags);
 	IF_DB_PRINTF("0x%08x", if_drv_flags);
 	IF_DB_PRINTF("0x%08x", if_capabilities);
 	IF_DB_PRINTF("0x%08x", if_capenable);
 	/* Send queue state (members of the embedded if_snd). */
 	IF_DB_PRINTF("%p", if_snd.ifq_head);
 	IF_DB_PRINTF("%p", if_snd.ifq_tail);
 	IF_DB_PRINTF("%d", if_snd.ifq_len);
 	IF_DB_PRINTF("%d", if_snd.ifq_maxlen);
 	IF_DB_PRINTF("%p", if_snd.ifq_drv_head);
 	IF_DB_PRINTF("%p", if_snd.ifq_drv_tail);
 	IF_DB_PRINTF("%d", if_snd.ifq_drv_len);
 	IF_DB_PRINTF("%d", if_snd.ifq_drv_maxlen);
 	IF_DB_PRINTF("%d", if_snd.altq_type);
 	IF_DB_PRINTF("%x", if_snd.altq_flags);
 #undef IF_DB_PRINTF
 }
 
 /* DDB "show ifnet <addr>": dump the struct ifnet at the given address. */
 DB_SHOW_COMMAND(ifnet, db_show_ifnet)
 {
 
 	if (have_addr) {
 		if_show_ifnet((struct ifnet *)addr);
 		return;
 	}
 
 	db_printf("usage: show ifnet <struct ifnet *>\n");
 }
 
 /*
  * DDB "show all ifnets": print the name and pointer of every non-NULL
  * ifindex_table entry from index 1 to if_index, stopping early once
  * db_pager_quit is set.
  */
 DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	for (idx = 1; idx <= if_index; idx++) {
 		ifp = ifindex_table[idx].ife_ifnet;
 		if (ifp != NULL) {
 			db_printf("%20s ifp=%p\n", ifp->if_xname, ifp);
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif	/* DDB */
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 5c3ce137931c..67d031a65052 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -1,3797 +1,3798 @@
 /*	$NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright 2001 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Jason R. Thorpe for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed for the NetBSD Project by
  *	Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp
  */
 
 /*
  * Network interface bridge support.
  *
  * TODO:
  *
  *	- Currently only supports Ethernet-like interfaces (Ethernet,
  *	  802.11, VLANs on Ethernet, etc.)  Figure out a nice way
  *	  to bridge other types of interfaces (maybe consider
  *	  heterogeneous bridges).
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/protosw.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/time.h>
 #include <sys/socket.h> /* for net/if.h */
 #include <sys/sockio.h>
 #include <sys/ctype.h>  /* string functions */
 #include <sys/kernel.h>
 #include <sys/random.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <vm/uma.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 #if defined(INET) || defined(INET6)
 #include <netinet/ip_carp.h>
 #endif
 #include <machine/in_cksum.h>
 #include <netinet/if_ether.h>
 #include <net/bridgestp.h>
 #include <net/if_bridgevar.h>
 #include <net/if_llc.h>
 #include <net/if_vlan_var.h>
 
 #include <net/route.h>
 
 #ifdef INET6
 /*
  * XXX: declared here to avoid including many inet6-related files;
  * should this be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /*
  * Size of the route hash table.  Must be a power of two.
  */
 #ifndef BRIDGE_RTHASH_SIZE
 #define	BRIDGE_RTHASH_SIZE		1024
 #endif
 
 #define	BRIDGE_RTHASH_MASK		(BRIDGE_RTHASH_SIZE - 1)
 
 /*
  * Default maximum number of addresses to cache.
  */
 #ifndef BRIDGE_RTABLE_MAX
 #define	BRIDGE_RTABLE_MAX		2000
 #endif
 
 /*
  * Timeout (in seconds) for entries learned dynamically.
  */
 #ifndef BRIDGE_RTABLE_TIMEOUT
 #define	BRIDGE_RTABLE_TIMEOUT		(20 * 60)	/* same as ARP */
 #endif
 
 /*
  * Number of seconds between walks of the route list.
  */
 #ifndef BRIDGE_RTABLE_PRUNE_PERIOD
 #define	BRIDGE_RTABLE_PRUNE_PERIOD	(5 * 60)
 #endif
 
 /*
  * List of capabilities to possibly mask on the member interface.
  */
 #define	BRIDGE_IFCAPS_MASK		(IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\
 					 IFCAP_TXCSUM_IPV6)
 
 /*
  * List of capabilities to strip
  */
 #define	BRIDGE_IFCAPS_STRIP		IFCAP_LRO
 
 /*
  * Bridge locking
  *
  * The bridge relies heavily on the epoch(9) system to protect its data
  * structures. This means we can safely use CK_LISTs while in NET_EPOCH, but we
  * must ensure there is only one writer at a time.
  *
  * That is: for read accesses we only need to be in NET_EPOCH, but for write
  * accesses we must hold:
  *
  *  - BRIDGE_RT_LOCK, for any change to bridge_rtnodes
  *  - BRIDGE_LOCK, for any other change
  *
  * The BRIDGE_LOCK is a sleepable lock, because it is held across ioctl()
  * calls to bridge member interfaces and these ioctl()s can sleep.
  * The BRIDGE_RT_LOCK is a non-sleepable mutex, because it is sometimes
  * required while we're in NET_EPOCH and then we're not allowed to sleep.
  */
 /* Initialize both per-bridge locks: the sx lock and the rt mutex. */
 #define BRIDGE_LOCK_INIT(_sc)		do {			\
 	sx_init(&(_sc)->sc_sx, "if_bridge");			\
 	mtx_init(&(_sc)->sc_rt_mtx, "if_bridge rt", NULL, MTX_DEF);	\
 } while (0)
 /* Destroy both per-bridge locks. */
 #define BRIDGE_LOCK_DESTROY(_sc)	do {	\
 	sx_destroy(&(_sc)->sc_sx);		\
 	mtx_destroy(&(_sc)->sc_rt_mtx);		\
 } while (0)
 /* BRIDGE_LOCK: the sx lock, always taken exclusively. */
 #define BRIDGE_LOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
 #define BRIDGE_UNLOCK(_sc)		sx_xunlock(&(_sc)->sc_sx)
 #define BRIDGE_LOCK_ASSERT(_sc)		sx_assert(&(_sc)->sc_sx, SX_XLOCKED)
 /* Passes when the caller is in the net epoch or holds BRIDGE_LOCK. */
 #define BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(_sc)	\
 	    MPASS(in_epoch(net_epoch_preempt) || sx_xlocked(&(_sc)->sc_sx))
 #define BRIDGE_UNLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SX_UNLOCKED)
 /* BRIDGE_RT_LOCK: the mutex covering bridge_rtnodes changes. */
 #define BRIDGE_RT_LOCK(_sc)		mtx_lock(&(_sc)->sc_rt_mtx)
 #define BRIDGE_RT_UNLOCK(_sc)		mtx_unlock(&(_sc)->sc_rt_mtx)
 #define BRIDGE_RT_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->sc_rt_mtx, MA_OWNED)
 /* Passes when the caller is in the net epoch or owns BRIDGE_RT_LOCK. */
 #define BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(_sc)	\
 	    MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(_sc)->sc_rt_mtx))
 
 /*
  * Bridge interface list entry.
  *
  * One entry per interface linked on a bridge's sc_iflist or sc_spanlist.
  */
 struct bridge_iflist {
 	CK_LIST_ENTRY(bridge_iflist) bif_next;
 	struct ifnet		*bif_ifp;	/* member if */
 	struct bstp_port	bif_stp;	/* STP state */
 	uint32_t		bif_flags;	/* member if flags */
 	int			bif_savedcaps;	/* saved capabilities */
 	uint32_t		bif_addrmax;	/* max # of addresses */
 	uint32_t		bif_addrcnt;	/* cur. # of addresses */
 	uint32_t		bif_addrexceeded;/* # of address violations */
 	/* Handed to bridge_delete_member_cb(), which frees this entry. */
 	struct epoch_context	bif_epoch_ctx;
 };
 
 /*
  * Bridge route node.
  *
  * Each node is linked twice: once on a hash chain (brt_hash) and once on a
  * flat list (brt_list).
  */
 struct bridge_rtnode {
 	CK_LIST_ENTRY(bridge_rtnode) brt_hash;	/* hash table linkage */
 	CK_LIST_ENTRY(bridge_rtnode) brt_list;	/* list linkage */
 	struct bridge_iflist	*brt_dst;	/* destination if */
 	unsigned long		brt_expire;	/* expiration time */
 	uint8_t			brt_flags;	/* address flags */
 	uint8_t			brt_addr[ETHER_ADDR_LEN];
 	uint16_t		brt_vlan;	/* vlan id */
 	struct	vnet		*brt_vnet;
 	struct	epoch_context	brt_epoch_ctx;
 };
 /* Shorthand: the ifnet of the member interface a route node points at. */
 #define	brt_ifp			brt_dst->bif_ifp
 
 /*
  * Software state for each bridge.
  *
  * Allocated in bridge_clone_create() and freed from
  * bridge_clone_destroy_cb().
  */
 struct bridge_softc {
 	struct ifnet		*sc_ifp;	/* make this an interface */
 	LIST_ENTRY(bridge_softc) sc_list;	/* linkage on V_bridge_list */
 	struct sx		sc_sx;		/* BRIDGE_LOCK */
 	struct mtx		sc_rt_mtx;	/* BRIDGE_RT_LOCK */
 	uint32_t		sc_brtmax;	/* max # of addresses */
 	uint32_t		sc_brtcnt;	/* cur. # of addresses */
 	uint32_t		sc_brttimeout;	/* rt timeout in seconds */
 	struct callout		sc_brcallout;	/* bridge callout */
 	CK_LIST_HEAD(, bridge_iflist) sc_iflist;	/* member interface list */
 	CK_LIST_HEAD(, bridge_rtnode) *sc_rthash;	/* our forwarding table */
 	CK_LIST_HEAD(, bridge_rtnode) sc_rtlist;	/* list version of above */
 	uint32_t		sc_rthash_key;	/* key for hash */
 	CK_LIST_HEAD(, bridge_iflist) sc_spanlist;	/* span ports list */
 	struct bstp_state	sc_stp;		/* STP state */
 	uint32_t		sc_brtexceeded;	/* # of cache drops */
 	struct ifnet		*sc_ifaddr;	/* member mac copied from */
 	struct ether_addr	sc_defaddr;	/* Default MAC address */
 	/* Handed to bridge_clone_destroy_cb(), which frees the softc. */
 	struct epoch_context	sc_epoch_ctx;
 };
 
 VNET_DEFINE_STATIC(struct sx, bridge_list_sx);
 #define	V_bridge_list_sx	VNET(bridge_list_sx)
 static eventhandler_tag bridge_detach_cookie;
 
 int	bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD;
 
 VNET_DEFINE_STATIC(uma_zone_t, bridge_rtnode_zone);
 #define	V_bridge_rtnode_zone	VNET(bridge_rtnode_zone)
 
 static int	bridge_clone_create(struct if_clone *, char *, size_t,
 		    struct ifc_data *, struct ifnet **);
 static int	bridge_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 
 static int	bridge_ioctl(struct ifnet *, u_long, caddr_t);
 static void	bridge_mutecaps(struct bridge_softc *);
 static void	bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *,
 		    int);
 static void	bridge_ifdetach(void *arg __unused, struct ifnet *);
 static void	bridge_init(void *);
 static void	bridge_dummynet(struct mbuf *, struct ifnet *);
 static void	bridge_stop(struct ifnet *, int);
 static int	bridge_transmit(struct ifnet *, struct mbuf *);
 #ifdef ALTQ
 static void	bridge_altq_start(if_t);
 static int	bridge_altq_transmit(if_t, struct mbuf *);
 #endif
 static void	bridge_qflush(struct ifnet *);
 static struct mbuf *bridge_input(struct ifnet *, struct mbuf *);
 static int	bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *,
 		    struct rtentry *);
 static int	bridge_enqueue(struct bridge_softc *, struct ifnet *,
 		    struct mbuf *);
 static void	bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int);
 
 static void	bridge_forward(struct bridge_softc *, struct bridge_iflist *,
 		    struct mbuf *m);
 
 static void	bridge_timer(void *);
 
 static void	bridge_broadcast(struct bridge_softc *, struct ifnet *,
 		    struct mbuf *, int);
 static void	bridge_span(struct bridge_softc *, struct mbuf *);
 
 static int	bridge_rtupdate(struct bridge_softc *, const uint8_t *,
 		    uint16_t, struct bridge_iflist *, int, uint8_t);
 static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *,
 		    uint16_t);
 static void	bridge_rttrim(struct bridge_softc *);
 static void	bridge_rtage(struct bridge_softc *);
 static void	bridge_rtflush(struct bridge_softc *, int);
 static int	bridge_rtdaddr(struct bridge_softc *, const uint8_t *,
 		    uint16_t);
 
 static void	bridge_rtable_init(struct bridge_softc *);
 static void	bridge_rtable_fini(struct bridge_softc *);
 
 static int	bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *);
 static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *,
 		    const uint8_t *, uint16_t);
 static int	bridge_rtnode_insert(struct bridge_softc *,
 		    struct bridge_rtnode *);
 static void	bridge_rtnode_destroy(struct bridge_softc *,
 		    struct bridge_rtnode *);
 static void	bridge_rtable_expire(struct ifnet *, int);
 static void	bridge_state_change(struct ifnet *, int);
 
 static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *,
 		    const char *name);
 static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *,
 		    struct ifnet *ifp);
 static void	bridge_delete_member(struct bridge_softc *,
 		    struct bridge_iflist *, int);
 static void	bridge_delete_span(struct bridge_softc *,
 		    struct bridge_iflist *);
 
 static int	bridge_ioctl_add(struct bridge_softc *, void *);
 static int	bridge_ioctl_del(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_scache(struct bridge_softc *, void *);
 static int	bridge_ioctl_gcache(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifs(struct bridge_softc *, void *);
 static int	bridge_ioctl_rts(struct bridge_softc *, void *);
 static int	bridge_ioctl_saddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_sto(struct bridge_softc *, void *);
 static int	bridge_ioctl_gto(struct bridge_softc *, void *);
 static int	bridge_ioctl_daddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_flush(struct bridge_softc *, void *);
 static int	bridge_ioctl_gpri(struct bridge_softc *, void *);
 static int	bridge_ioctl_spri(struct bridge_softc *, void *);
 static int	bridge_ioctl_ght(struct bridge_softc *, void *);
 static int	bridge_ioctl_sht(struct bridge_softc *, void *);
 static int	bridge_ioctl_gfd(struct bridge_softc *, void *);
 static int	bridge_ioctl_sfd(struct bridge_softc *, void *);
 static int	bridge_ioctl_gma(struct bridge_softc *, void *);
 static int	bridge_ioctl_sma(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifprio(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifcost(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_addspan(struct bridge_softc *, void *);
 static int	bridge_ioctl_delspan(struct bridge_softc *, void *);
 static int	bridge_ioctl_gbparam(struct bridge_softc *, void *);
 static int	bridge_ioctl_grte(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifsstp(struct bridge_softc *, void *);
 static int	bridge_ioctl_sproto(struct bridge_softc *, void *);
 static int	bridge_ioctl_stxhc(struct bridge_softc *, void *);
 static int	bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *,
 		    int);
 static int	bridge_ip_checkbasic(struct mbuf **mp);
 #ifdef INET6
 static int	bridge_ip6_checkbasic(struct mbuf **mp);
 #endif /* INET6 */
 static int	bridge_fragment(struct ifnet *, struct mbuf **mp,
 		    struct ether_header *, int, struct llc *);
 static void	bridge_linkstate(struct ifnet *ifp);
 static void	bridge_linkcheck(struct bridge_softc *sc);
 
 /*
  * VLANTAGOF: the VLAN id carried by mbuf _m, or 1 when the mbuf has no
  * VLAN tag.  The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2).
  *
  * Both the argument and the whole expansion are parenthesized so the macro
  * can be used inside larger expressions and with non-trivial arguments.
  * Note that _m is evaluated twice; do not pass an expression with side
  * effects.
  */
 #define	VLANTAGOF(_m)	\
     (((_m)->m_flags & M_VLANTAG) ?	\
     EVL_VLANOFTAG((_m)->m_pkthdr.ether_vtag) : 1)
 
 static struct bstp_cb_ops bridge_ops = {
 	.bcb_state = bridge_state_change,
 	.bcb_rtage = bridge_rtable_expire
 };
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bridge");
 
 /* only pass IP[46] packets when pfil is enabled */
 VNET_DEFINE_STATIC(int, pfil_onlyip) = 1;
 #define	V_pfil_onlyip	VNET(pfil_onlyip)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0,
     "Only pass IP packets when pfil is enabled");
 
 /* run pfil hooks on the bridge interface */
 VNET_DEFINE_STATIC(int, pfil_bridge) = 0;
 #define	V_pfil_bridge	VNET(pfil_bridge)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0,
     "Packet filter on the bridge interface");
 
 /* layer2 filter with ipfw */
 VNET_DEFINE_STATIC(int, pfil_ipfw);
 #define	V_pfil_ipfw	VNET(pfil_ipfw)
 
 /* layer2 ARP filter with ipfw */
 VNET_DEFINE_STATIC(int, pfil_ipfw_arp);
 #define	V_pfil_ipfw_arp	VNET(pfil_ipfw_arp)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0,
     "Filter ARP packets through IPFW layer2");
 
 /* run pfil hooks on the member interface */
 VNET_DEFINE_STATIC(int, pfil_member) = 0;
 #define	V_pfil_member	VNET(pfil_member)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0,
     "Packet filter on the member interface");
 
 /* run pfil hooks on the physical interface for locally destined packets */
 VNET_DEFINE_STATIC(int, pfil_local_phys);
 #define	V_pfil_local_phys	VNET(pfil_local_phys)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0,
     "Packet filter on the physical interface for locally destined packets");
 
 /* log STP state changes */
 VNET_DEFINE_STATIC(int, log_stp);
 #define	V_log_stp	VNET(log_stp)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0,
     "Log STP state changes");
 
 /* share MAC with first bridge member */
 VNET_DEFINE_STATIC(int, bridge_inherit_mac);
 #define	V_bridge_inherit_mac	VNET(bridge_inherit_mac)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0,
     "Inherit MAC address from the first bridge member");
 
 VNET_DEFINE_STATIC(int, allow_llz_overlap) = 0;
 #define	V_allow_llz_overlap	VNET(allow_llz_overlap)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap,
     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0,
     "Allow overlap of link-local scope "
     "zones of a bridge interface and the member interfaces");
 
 /*
  * One entry of the driver-specific ioctl dispatch table
  * (bridge_control_table), consumed by bridge_ioctl().
  */
 struct bridge_control {
 	/* Handler; called with the softc and the argument buffer. */
 	int	(*bc_func)(struct bridge_softc *, void *);
 	/* Exact argument size the request's ifd_len must match. */
 	int	bc_argsize;
 	/* BC_F_* flags below. */
 	int	bc_flags;
 };
 
 #define	BC_F_COPYIN		0x01	/* copy arguments in */
 #define	BC_F_COPYOUT		0x02	/* copy arguments out */
 #define	BC_F_SUSER		0x04	/* do super-user check */
 
 /*
  * Dispatch table for SIOCGDRVSPEC/SIOCSDRVSPEC requests.
  *
  * bridge_ioctl() indexes this table directly with the request's ifd_cmd, so
  * the position of each entry is significant.  Each entry names the handler,
  * the exact argument size the caller must supply, and BC_F_* flags that say
  * whether arguments are copied in and/or out and whether a privilege check
  * is performed first.
  */
 const struct bridge_control bridge_control_table[] = {
 	{ bridge_ioctl_add,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_del,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gifflags,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 	{ bridge_ioctl_sifflags,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_scache,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_gcache,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_gifs,		sizeof(struct ifbifconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 	{ bridge_ioctl_rts,		sizeof(struct ifbaconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 
 	{ bridge_ioctl_saddr,		sizeof(struct ifbareq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sto,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_gto,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_daddr,		sizeof(struct ifbareq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_flush,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gpri,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_spri,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_ght,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sht,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gfd,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sfd,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gma,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sma,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifprio,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifcost,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_addspan,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_delspan,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gbparam,		sizeof(struct ifbropreq),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_grte,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_gifsstp,		sizeof(struct ifbpstpconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 
 	{ bridge_ioctl_sproto,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_stxhc,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifmaxaddr,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 };
 /* Number of entries; bridge_ioctl() rejects ifd_cmd values at or above it. */
 const int bridge_control_table_size = nitems(bridge_control_table);
 
 VNET_DEFINE_STATIC(LIST_HEAD(, bridge_softc), bridge_list);
 #define	V_bridge_list	VNET(bridge_list)
 #define	BRIDGE_LIST_LOCK_INIT(x)	sx_init(&V_bridge_list_sx,	\
 					    "if_bridge list")
 #define	BRIDGE_LIST_LOCK_DESTROY(x)	sx_destroy(&V_bridge_list_sx)
 #define	BRIDGE_LIST_LOCK(x)		sx_xlock(&V_bridge_list_sx)
 #define	BRIDGE_LIST_UNLOCK(x)		sx_xunlock(&V_bridge_list_sx)
 
 VNET_DEFINE_STATIC(struct if_clone *, bridge_cloner);
 #define	V_bridge_cloner	VNET(bridge_cloner)
 
 static const char bridge_name[] = "bridge";
 
 /*
  * Per-vnet setup: create the rtnode UMA zone, initialize the bridge list
  * and its lock, and attach the "bridge" interface cloner.
  */
 static void
 vnet_bridge_init(const void *unused __unused)
 {
 	struct if_clone_addreq cloner_req = {
 		.create_f = bridge_clone_create,
 		.destroy_f = bridge_clone_destroy,
 		.flags = IFC_F_AUTOUNIT,
 	};
 
 	V_bridge_rtnode_zone = uma_zcreate("bridge_rtnode",
 	    sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	LIST_INIT(&V_bridge_list);
 	BRIDGE_LIST_LOCK_INIT();
 
 	V_bridge_cloner = ifc_attach_cloner(bridge_name, &cloner_req);
 }
 VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_bridge_init, NULL);
 
 /*
  * Per-vnet teardown, mirroring vnet_bridge_init(): detach the cloner,
  * destroy the bridge list lock, and finally destroy the rtnode UMA zone.
  */
 static void
 vnet_bridge_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_bridge_cloner);
 	V_bridge_cloner = NULL;
 	BRIDGE_LIST_LOCK_DESTROY();
 
 	/*
 	 * Callbacks may use the UMA zone, so wait for all pending epoch
 	 * callbacks to run before the zone is destroyed below.
 	 */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	uma_zdestroy(V_bridge_rtnode_zone);
 }
 VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_bridge_uninit, NULL);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD installs the bridge_dn_p hook and registers bridge_ifdetach() for
  * ifnet departure events; MOD_UNLOAD undoes both.  Any other event type is
  * answered with EOPNOTSUPP.
  */
 static int
 bridge_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD) {
 		bridge_dn_p = bridge_dummynet;
 		bridge_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, bridge_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    bridge_detach_cookie);
 		bridge_dn_p = NULL;
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, and an unused third field. */
 static moduledata_t bridge_mod = {
 	"if_bridge",
 	bridge_modevent,
 	0
 };
 
 /* Register the module; it declares a dependency on bridgestp version 1. */
 DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_bridge, 1);
 MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1);
 
 /*
  * Handler for net.link.bridge.ipfw.
  *
  * The value is reduced to its low bit.  When that differs from the current
  * setting it is stored, and turning the knob on additionally clears
  * pfil_onlyip, pfil_bridge and pfil_member.  The result of
  * sysctl_handle_int() is returned in every case.
  */
 static int
 sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS)
 {
 	int error, enable;
 
 	enable = V_pfil_ipfw;
 	error = sysctl_handle_int(oidp, &enable, 0, req);
 	enable &= 1;
 
 	/* Nothing to do when the setting is unchanged. */
 	if (enable == V_pfil_ipfw)
 		return (error);
 
 	V_pfil_ipfw = enable;
 
 	/*
 	 * Disable pfil so that ipfw doesn't run twice; a user who really
 	 * wants both can re-enable pfil_bridge and/or pfil_member.  Non-IP
 	 * packets are allowed too, since ipfw can filter by layer2 type.
 	 */
 	if (enable) {
 		V_pfil_onlyip = 0;
 		V_pfil_bridge = 0;
 		V_pfil_member = 0;
 	}
 
 	return (error);
 }
 SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET | CTLFLAG_NEEDGIANT,
     &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I",
     "Layer2 filter with IPFW");
 
 #ifdef VIMAGE
 /*
  * if_reassign hook: with the bridge lock held, remove every member and span
  * interface from the bridge, then hand the bridge's ifnet to
  * ether_reassign().
  */
 static void
 bridge_reassign(struct ifnet *ifp, struct vnet *newvnet, char *arg)
 {
 	struct bridge_softc *sc;
 	struct bridge_iflist *entry;
 
 	sc = ifp->if_softc;
 
 	BRIDGE_LOCK(sc);
 
 	/* Drain the member list; each deletion unlinks the head entry. */
 	for (;;) {
 		entry = CK_LIST_FIRST(&sc->sc_iflist);
 		if (entry == NULL)
 			break;
 		bridge_delete_member(sc, entry, 0);
 	}
 
 	/* Same for the span port list. */
 	for (;;) {
 		entry = CK_LIST_FIRST(&sc->sc_spanlist);
 		if (entry == NULL)
 			break;
 		bridge_delete_span(sc, entry);
 	}
 
 	BRIDGE_UNLOCK(sc);
 
 	ether_reassign(ifp, newvnet, arg);
 }
 #endif
 
 /*
  * bridge_clone_create:
  *
  *	Create a new bridge instance.
  *
  *	Allocates the softc and its ifnet, initializes locks, the routing
  *	table, the callout and the member/span lists, attaches STP and the
  *	Ethernet layer, and links the softc on the per-vnet bridge list.
  *	The unit number comes from ifd->unit; ifc, name and len are not used
  *	here.  On success the new ifnet is stored in *ifpp and 0 is returned;
  *	ENOSPC is returned when if_alloc() fails.
  */
 static int
 bridge_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct bridge_softc *sc;
 	struct ifnet *ifp;
 
 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		/* Nothing else has been set up yet; only the softc to undo. */
 		free(sc, M_DEVBUF);
 		return (ENOSPC);
 	}
 
 	BRIDGE_LOCK_INIT(sc);
 	sc->sc_brtmax = BRIDGE_RTABLE_MAX;
 	sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;
 
 	/* Initialize our routing table. */
 	bridge_rtable_init(sc);
 
 	/* The bridge callout is associated with the rt mutex. */
 	callout_init_mtx(&sc->sc_brcallout, &sc->sc_rt_mtx, 0);
 
 	CK_LIST_INIT(&sc->sc_iflist);
 	CK_LIST_INIT(&sc->sc_spanlist);
 
 	ifp->if_softc = sc;
 	if_initname(ifp, bridge_name, ifd->unit);
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = bridge_ioctl;
 #ifdef ALTQ
 	/* With ALTQ, transmit goes through the ALTQ-aware entry points. */
 	ifp->if_start = bridge_altq_start;
 	ifp->if_transmit = bridge_altq_transmit;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	ifp->if_snd.ifq_drv_maxlen = 0;
 	IFQ_SET_READY(&ifp->if_snd);
 #else
 	ifp->if_transmit = bridge_transmit;
 #endif
 	ifp->if_qflush = bridge_qflush;
 	ifp->if_init = bridge_init;
 	ifp->if_type = IFT_BRIDGE;
 
 	/* Generate the default MAC address, kept in sc_defaddr. */
 	ether_gen_addr(ifp, &sc->sc_defaddr);
 
 	bstp_attach(&sc->sc_stp, &bridge_ops);
 	ether_ifattach(ifp, sc->sc_defaddr.octet);
 	/* Now undo some of the damage... */
 	/* NOTE(review): presumably ether_ifattach() overwrites these - confirm. */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_BRIDGE;
 #ifdef VIMAGE
 	ifp->if_reassign = bridge_reassign;
 #endif
 
 	/* Publish the new bridge on the per-vnet list. */
 	BRIDGE_LIST_LOCK();
 	LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list);
 	BRIDGE_LIST_UNLOCK();
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Epoch callback scheduled by bridge_clone_destroy(): recover the softc
  * from its embedded epoch context, destroy its locks and free it.
  */
 static void
 bridge_clone_destroy_cb(struct epoch_context *ctx)
 {
 	struct bridge_softc *bridge =
 	    __containerof(ctx, struct bridge_softc, sc_epoch_ctx);
 
 	BRIDGE_LOCK_DESTROY(bridge);
 	free(bridge, M_DEVBUF);
 }
 
 /*
  * bridge_clone_destroy:
  *
  *	Destroy a bridge instance.
  *
  *	Stops the bridge, removes all member and span interfaces, tears down
  *	the routing table, drains the callout, unlinks the softc from the
  *	per-vnet list, detaches STP and the Ethernet layer, frees the ifnet,
  *	and defers freeing the softc to an epoch callback.  ifc and flags are
  *	not used here.  Always returns 0.
  */
 static int
 bridge_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 	struct bridge_iflist *bif;
 	struct epoch_tracker et;
 
 	BRIDGE_LOCK(sc);
 
 	bridge_stop(ifp, 1);
 	ifp->if_flags &= ~IFF_UP;
 
 	/* Each deletion unlinks the head entry, so loop until empty. */
 	while ((bif = CK_LIST_FIRST(&sc->sc_iflist)) != NULL)
 		bridge_delete_member(sc, bif, 0);
 
 	while ((bif = CK_LIST_FIRST(&sc->sc_spanlist)) != NULL) {
 		bridge_delete_span(sc, bif);
 	}
 
 	/* Tear down the routing table. */
 	bridge_rtable_fini(sc);
 
 	BRIDGE_UNLOCK(sc);
 
 	NET_EPOCH_ENTER(et);
 
 	callout_drain(&sc->sc_brcallout);
 
 	BRIDGE_LIST_LOCK();
 	LIST_REMOVE(sc, sc_list);
 	BRIDGE_LIST_UNLOCK();
 
 	bstp_detach(&sc->sc_stp);
 #ifdef ALTQ
 	IFQ_PURGE(&ifp->if_snd);
 #endif
 	NET_EPOCH_EXIT(et);
 
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
 	/* The softc (and its locks) are released in bridge_clone_destroy_cb(). */
 	NET_EPOCH_CALL(bridge_clone_destroy_cb, &sc->sc_epoch_ctx);
 
 	return (0);
 }
 
 /*
  * bridge_ioctl:
  *
  *	Handle a control request from the operator.
  *
  *	ifp is the bridge interface, cmd the ioctl command, and data the
  *	command argument (interpreted as a struct ifreq or a struct ifdrv
  *	depending on cmd).  Returns 0 on success or an errno value.
  *
  *	The bridge lock is taken on entry and released on exit; two branches
  *	drop and re-take it temporarily around calls that must run unlocked.
  */
 static int
 bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct bridge_iflist *bif;
 	struct thread *td = curthread;
 	/* Scratch buffer for driver-specific request arguments. */
 	union {
 		struct ifbreq ifbreq;
 		struct ifbifconf ifbifconf;
 		struct ifbareq ifbareq;
 		struct ifbaconf ifbaconf;
 		struct ifbrparam ifbrparam;
 		struct ifbropreq ifbropreq;
 	} args;
 	struct ifdrv *ifd = (struct ifdrv *) data;
 	const struct bridge_control *bc;
 	int error = 0, oldmtu;
 
 	BRIDGE_LOCK(sc);
 
 	switch (cmd) {
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted; nothing to do. */
 		break;
 
 	case SIOCGDRVSPEC:
 	case SIOCSDRVSPEC:
 		/* ifd_cmd is an index into bridge_control_table. */
 		if (ifd->ifd_cmd >= bridge_control_table_size) {
 			error = EINVAL;
 			break;
 		}
 		bc = &bridge_control_table[ifd->ifd_cmd];
 
 		/*
 		 * "Get" requests are only valid for entries that copy data
 		 * out, and "set" requests only for entries that do not.
 		 */
 		if (cmd == SIOCGDRVSPEC &&
 		    (bc->bc_flags & BC_F_COPYOUT) == 0) {
 			error = EINVAL;
 			break;
 		}
 		else if (cmd == SIOCSDRVSPEC &&
 		    (bc->bc_flags & BC_F_COPYOUT) != 0) {
 			error = EINVAL;
 			break;
 		}
 
 		if (bc->bc_flags & BC_F_SUSER) {
 			error = priv_check(td, PRIV_NET_BRIDGE);
 			if (error)
 				break;
 		}
 
 		/*
 		 * The caller-supplied length must match the entry exactly
 		 * and must fit in the local argument buffer.
 		 */
 		if (ifd->ifd_len != bc->bc_argsize ||
 		    ifd->ifd_len > sizeof(args)) {
 			error = EINVAL;
 			break;
 		}
 
 		/* Handlers without BC_F_COPYIN start from zeroed arguments. */
 		bzero(&args, sizeof(args));
 		if (bc->bc_flags & BC_F_COPYIN) {
 			error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 			if (error)
 				break;
 		}
 
 		oldmtu = ifp->if_mtu;
 		error = (*bc->bc_func)(sc, &args);
 		if (error)
 			break;
 
 		/*
 		 * Bridge MTU may change during addition of the first port.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 
 		if (bc->bc_flags & BC_F_COPYOUT)
 			error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
 
 		break;
 
 	case SIOCSIFFLAGS:
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			bridge_stop(ifp, 1);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.  The init routine is called with the
 			 * bridge lock dropped.
 			 */
 			BRIDGE_UNLOCK(sc);
 			(*ifp->if_init)(sc);
 			BRIDGE_LOCK(sc);
 		}
 		break;
 
 	case SIOCSIFMTU:
 		oldmtu = sc->sc_ifp->if_mtu;
 
 		/* Refuse MTUs below 576. */
 		if (ifr->ifr_mtu < 576) {
 			error = EINVAL;
 			break;
 		}
 		/* No members: only the bridge's own MTU needs to change. */
 		if (CK_LIST_EMPTY(&sc->sc_iflist)) {
 			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
 			break;
 		}
 		/* Push the new MTU to every member; stop at the first failure. */
 		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 			error = (*bif->bif_ifp->if_ioctl)(bif->bif_ifp,
 			    SIOCSIFMTU, (caddr_t)ifr);
 			if (error != 0) {
 				log(LOG_NOTICE, "%s: invalid MTU: %u for"
 				    " member %s\n", sc->sc_ifp->if_xname,
 				    ifr->ifr_mtu,
 				    bif->bif_ifp->if_xname);
 				error = EINVAL;
 				break;
 			}
 		}
 		if (error) {
 			/*
 			 * Restore the previous MTU on all member interfaces.
 			 * Note that this overwrites ifr->ifr_mtu with the old
 			 * value, and errors from the restore are ignored.
 			 */
 			ifr->ifr_mtu = oldmtu;
 			CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 				(*bif->bif_ifp->if_ioctl)(bif->bif_ifp,
 				    SIOCSIFMTU, (caddr_t)ifr);
 			}
 		} else {
 			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 	default:
 		/*
 		 * drop the lock as ether_ioctl() will call bridge_start() and
 		 * cause the lock to be recursed.
 		 */
 		BRIDGE_UNLOCK(sc);
 		error = ether_ioctl(ifp, cmd, data);
 		BRIDGE_LOCK(sc);
 		break;
 	}
 
 	BRIDGE_UNLOCK(sc);
 
 	return (error);
 }
 
 /*
  * bridge_mutecaps:
  *
  *	Clear or restore unwanted capabilities on the member interface
  */
 static void
 bridge_mutecaps(struct bridge_softc *sc)
 {
 	struct bridge_iflist *bif;
 	int enabled, mask;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	/* Initial bitmask of capabilities to test */
 	mask = BRIDGE_IFCAPS_MASK;
 
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		/* Every member must support it or its disabled */
 		mask &= bif->bif_savedcaps;
 	}
 
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		enabled = bif->bif_ifp->if_capenable;
 		enabled &= ~BRIDGE_IFCAPS_STRIP;
 		/* strip off mask bits and enable them again if allowed */
 		enabled &= ~BRIDGE_IFCAPS_MASK;
 		enabled |= mask;
 		bridge_set_ifcap(sc, bif, enabled);
 	}
 }
 
 /*
  * bridge_set_ifcap:
  *
  *	Ask member interface bif to switch its enabled capabilities to
  *	"set" via SIOCSIFCAP.  Nothing is done when the member already has
  *	exactly that set.  A failing ioctl, and any bridge-managed
  *	capability that is still enabled afterwards although it was not
  *	requested, are reported on the bridge interface.
  */
 static void
 bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
 {
 	struct ifnet *ifp = bif->bif_ifp;
 	struct ifreq ifr;
 	int error, stuck;
 
 	if (ifp->if_capenable == set)
 		return;
 
 	bzero(&ifr, sizeof(ifr));
 	ifr.ifr_reqcap = set;
 
 	error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
 	if (error) {
 		if_printf(sc->sc_ifp,
 		    "error setting capabilities on %s: %d\n",
 		    ifp->if_xname, error);
 	}
 
 	/* Bits we manage that are still on but were not asked for. */
 	stuck = ifp->if_capenable &
 	    (BRIDGE_IFCAPS_MASK | BRIDGE_IFCAPS_STRIP) & ~set;
 	if (stuck != 0) {
 		if_printf(sc->sc_ifp,
 		    "can't disable some capabilities on %s: 0x%x\n",
 		    ifp->if_xname, stuck);
 	}
 }
 
 /*
  * bridge_lookup_member:
  *
  *	Lookup a bridge member interface.
  */
 static struct bridge_iflist *
 bridge_lookup_member(struct bridge_softc *sc, const char *name)
 {
 	struct bridge_iflist *bif;
 	struct ifnet *ifp;
 
 	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);
 
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		ifp = bif->bif_ifp;
 		if (strcmp(ifp->if_xname, name) == 0)
 			return (bif);
 	}
 
 	return (NULL);
 }
 
 /*
  * bridge_lookup_member_if:
  *
  *	Find the member of the bridge that wraps the given ifnet pointer.
  *	Returns the matching bridge_iflist, or NULL if there is none.
  *	The caller must hold the bridge lock or be inside a net epoch
  *	section.
  */
 static struct bridge_iflist *
 bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
 {
 	struct bridge_iflist *bif;
 
 	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);
 
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (member_ifp == bif->bif_ifp)
 			break;
 	}
 
 	/* bif is NULL here when no member matched. */
 	return (bif);
 }
 
 /*
  * bridge_delete_member_cb:
  *
  *	Callback scheduled through NET_EPOCH_CALL(); recovers the
  *	bridge_iflist that embeds the epoch context and frees it.
  */
 static void
 bridge_delete_member_cb(struct epoch_context *ctx)
 {
 
 	free(__containerof(ctx, struct bridge_iflist, bif_epoch_ctx),
 	    M_DEVBUF);
 }
 
 /*
  * bridge_delete_member:
  *
  *	Delete the specified member interface.
  *
  *	The member is unlinked from sc_iflist, its bridge hooks are cleared
  *	and bridge_rtdelete() is run for it with IFBF_FLUSHALL.  When `gone'
  *	is zero the interface is additionally taken out of promiscuous mode
  *	(Ethernet and VLAN types only) and its saved capabilities are
  *	restored; a non-zero `gone' skips both steps.  The bridge_iflist is
  *	released through NET_EPOCH_CALL() rather than freed directly.
  *	The bridge lock must be held.
  */
 static void
 bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
     int gone)
 {
 	struct ifnet *ifs = bif->bif_ifp;
 	struct ifnet *fif = NULL;
 	struct bridge_iflist *bifl;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	if (bif->bif_flags & IFBIF_STP)
 		bstp_disable(&bif->bif_stp);
 
 	/* Detach from the bridge before anything below walks sc_iflist. */
 	ifs->if_bridge = NULL;
 	CK_LIST_REMOVE(bif, bif_next);
 
 	/*
 	 * If removing the interface that gave the bridge its mac address, set
 	 * the mac address of the bridge to the address of the next member, or
 	 * to its default address if no members are left.
 	 */
 	if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) {
 		if (CK_LIST_EMPTY(&sc->sc_iflist)) {
 			bcopy(&sc->sc_defaddr,
 			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			sc->sc_ifaddr = NULL;
 		} else {
 			bifl = CK_LIST_FIRST(&sc->sc_iflist);
 			fif = bifl->bif_ifp;
 			bcopy(IF_LLADDR(fif),
 			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			sc->sc_ifaddr = fif;
 		}
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	}
 
 	bridge_linkcheck(sc);
 	bridge_mutecaps(sc);	/* recalculate now this interface is removed */
 	BRIDGE_RT_LOCK(sc);
 	bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
 	BRIDGE_RT_UNLOCK(sc);
 	KASSERT(bif->bif_addrcnt == 0,
 	    ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
 
 	ifs->if_bridge_output = NULL;
 	ifs->if_bridge_input = NULL;
 	ifs->if_bridge_linkstate = NULL;
 	if (!gone) {
 		switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_L2VLAN:
 			/*
 			 * Take the interface out of promiscuous mode, but only
 			 * if it was promiscuous in the first place. It might
 			 * not be if we're in the bridge_ioctl_add() error path.
 			 */
 			if (ifs->if_flags & IFF_PROMISC)
 				(void) ifpromisc(ifs, 0);
 			break;
 
 		case IFT_GIF:
 			/* Never put into promiscuous mode by bridge_ioctl_add(). */
 			break;
 
 		default:
 #ifdef DIAGNOSTIC
 			panic("bridge_delete_member: impossible");
 #endif
 			break;
 		}
 		/* re-enable any interface capabilities */
 		bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
 	}
 	bstp_destroy(&bif->bif_stp);	/* prepare to free */
 
 	/* The actual free happens in bridge_delete_member_cb(). */
 	NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx);
 }
 
 /*
  * bridge_delete_span:
  *
  *	Delete the specified span interface.
  *
  *	The entry is unlinked from the list it is on and handed to
  *	NET_EPOCH_CALL(), which frees it through bridge_delete_member_cb().
  *	The bridge lock must be held.
  */
 static void
 bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif)
 {
 	BRIDGE_LOCK_ASSERT(sc);
 
 	/* A span port never has if_bridge set; members do. */
 	KASSERT(bif->bif_ifp->if_bridge == NULL,
 	    ("%s: not a span interface", __func__));
 
 	CK_LIST_REMOVE(bif, bif_next);
 
 	NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx);
 }
 
 /*
  * bridge_ioctl_add:
  *
  *	Add the interface named in the request as a member of the bridge.
  *
  *	Returns ENOENT if the interface does not exist, EINVAL if it has no
  *	ioctl handler, is of an unsupported type or rejects the bridge MTU,
  *	EBUSY if it is a span port of this bridge or a member of another
  *	bridge, EEXIST if it is already a member of this bridge, ENOMEM if
  *	the member record cannot be allocated, or the error returned by
  *	ifpromisc().  On an ifpromisc() failure the member is removed again.
  */
 static int
 bridge_ioctl_add(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif = NULL;
 	struct ifnet *ifs;
 	int error = 0;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 	if (ifs->if_ioctl == NULL)	/* must be supported */
 		return (EINVAL);
 
 	/* If it's in the span list, it can't be a member. */
 	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			return (EBUSY);
 
 	if (ifs->if_bridge == sc)
 		return (EEXIST);
 
 	if (ifs->if_bridge != NULL)
 		return (EBUSY);
 
 	switch (ifs->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_GIF:
 		/* permitted interface types */
 		break;
 	default:
 		return (EINVAL);
 	}
 
 #ifdef INET6
 	/*
 	 * Two valid inet6 addresses with link-local scope must not be
 	 * on the parent interface and the member interfaces at the
 	 * same time.  This restriction is needed to prevent violation
 	 * of link-local scope zone.  Attempts to add a member
 	 * interface which has inet6 addresses when the parent has
 	 * inet6 triggers removal of all inet6 addresses on the member
 	 * interface.
 	 */
 
 	/* Check if the parent interface has a link-local scope addr. */
 	if (V_allow_llz_overlap == 0 &&
 	    in6ifa_llaonifp(sc->sc_ifp) != NULL) {
 		/*
 		 * If any, remove all inet6 addresses from the member
 		 * interfaces.
 		 */
 		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
  			if (in6ifa_llaonifp(bif->bif_ifp)) {
 				in6_ifdetach(bif->bif_ifp);
 				if_printf(sc->sc_ifp,
 				    "IPv6 addresses on %s have been removed "
 				    "before adding it as a member to prevent "
 				    "IPv6 address scope violation.\n",
 				    bif->bif_ifp->if_xname);
 			}
 		}
 		/* Same treatment for the interface that is being added. */
 		if (in6ifa_llaonifp(ifs)) {
 			in6_ifdetach(ifs);
 			if_printf(sc->sc_ifp,
 			    "IPv6 addresses on %s have been removed "
 			    "before adding it as a member to prevent "
 			    "IPv6 address scope violation.\n",
 			    ifs->if_xname);
 		}
 	}
 #endif
 	/* Allow the first Ethernet member to define the MTU */
 	if (CK_LIST_EMPTY(&sc->sc_iflist))
 		sc->sc_ifp->if_mtu = ifs->if_mtu;
 	else if (sc->sc_ifp->if_mtu != ifs->if_mtu) {
 		struct ifreq ifr;
 
 		/* Later members are asked to adopt the bridge's MTU. */
 		snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s",
 		    ifs->if_xname);
 		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
 
 		error = (*ifs->if_ioctl)(ifs,
 		    SIOCSIFMTU, (caddr_t)&ifr);
 		if (error != 0) {
 			log(LOG_NOTICE, "%s: invalid MTU: %u for"
 			    " new member %s\n", sc->sc_ifp->if_xname,
 			    ifr.ifr_mtu,
 			    ifs->if_xname);
 			return (EINVAL);
 		}
 	}
 
 	bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (bif == NULL)
 		return (ENOMEM);
 
 	bif->bif_ifp = ifs;
 	bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
 	/* Remembered so the capabilities can be restored on removal. */
 	bif->bif_savedcaps = ifs->if_capenable;
 
 	/*
 	 * Assign the interface's MAC address to the bridge if it's the first
 	 * member and the MAC address of the bridge has not been changed from
 	 * the default randomly generated one.
 	 */
 	if (V_bridge_inherit_mac && CK_LIST_EMPTY(&sc->sc_iflist) &&
 	    !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr.octet, ETHER_ADDR_LEN)) {
 		bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 		sc->sc_ifaddr = ifs;
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	}
 
 	/* Hook the interface up to the bridge input/output paths. */
 	ifs->if_bridge = sc;
 	ifs->if_bridge_output = bridge_output;
 	ifs->if_bridge_input = bridge_input;
 	ifs->if_bridge_linkstate = bridge_linkstate;
 	bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
 	/*
 	 * XXX: XLOCK HERE!?!
 	 *
 	 * NOTE: insert_***HEAD*** should be safe for the traversals.
 	 */
 	CK_LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next);
 
 	/* Set interface capabilities to the intersection set of all members */
 	bridge_mutecaps(sc);
 	bridge_linkcheck(sc);
 
 	/* Place the interface into promiscuous mode */
 	switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_L2VLAN:
 			error = ifpromisc(ifs, 1);
 			break;
 	}
 
 	/* Undo the whole attachment if promiscuous mode could not be set. */
 	if (error)
 		bridge_delete_member(sc, bif, 0);
 	return (error);
 }
 
 /*
  * bridge_ioctl_del:
  *
  *	Remove the member named in the request from the bridge.  Returns
  *	ENOENT when no member carries that name, 0 otherwise.
  */
 static int
 bridge_ioctl_del(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req;
 	struct bridge_iflist *member;
 
 	req = arg;
 	member = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (member != NULL) {
 		bridge_delete_member(sc, member, 0);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * bridge_ioctl_gifflags:
  *
  *	Fill in the ifbreq for the member named in the request: bridge
  *	flags, address-learning counters and spanning tree port state.
  *	A subset of the STP port options is folded into ifbr_ifsflags as
  *	IFBIF_BSTP_* bits.  Returns ENOENT if the member does not exist.
  */
 static int
 bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 	bp = &bif->bif_stp;
 
 	/* Bridge-level member state. */
 	req->ifbr_ifsflags = bif->bif_flags;
 	req->ifbr_portno = bif->bif_ifp->if_index & 0xfff;
 	req->ifbr_addrcnt = bif->bif_addrcnt;
 	req->ifbr_addrmax = bif->bif_addrmax;
 	req->ifbr_addrexceeded = bif->bif_addrexceeded;
 
 	/* Spanning tree port state. */
 	req->ifbr_state = bp->bp_state;
 	req->ifbr_priority = bp->bp_priority;
 	req->ifbr_path_cost = bp->bp_path_cost;
 	req->ifbr_proto = bp->bp_protover;
 	req->ifbr_role = bp->bp_role;
 	req->ifbr_stpflags = bp->bp_flags;
 
 	/* Operational STP state reported as flags. */
 	if (bp->bp_operedge)
 		req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
 	if (bp->bp_ptp_link)
 		req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
 
 	/* Administrative STP port options reported as flags. */
 	if (bp->bp_flags & BSTP_PORT_AUTOEDGE)
 		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
 	if (bp->bp_flags & BSTP_PORT_AUTOPTP)
 		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
 	if (bp->bp_flags & BSTP_PORT_ADMEDGE)
 		req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
 	if (bp->bp_flags & BSTP_PORT_ADMCOST)
 		req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
 {
 	struct epoch_tracker et;
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 	int error;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 	bp = &bif->bif_stp;
 
 	if (req->ifbr_ifsflags & IFBIF_SPAN)
 		/* SPAN is readonly */
 		return (EINVAL);
 
 	NET_EPOCH_ENTER(et);
 
 	if (req->ifbr_ifsflags & IFBIF_STP) {
 		if ((bif->bif_flags & IFBIF_STP) == 0) {
 			error = bstp_enable(&bif->bif_stp);
 			if (error) {
 				NET_EPOCH_EXIT(et);
 				return (error);
 			}
 		}
 	} else {
 		if ((bif->bif_flags & IFBIF_STP) != 0)
 			bstp_disable(&bif->bif_stp);
 	}
 
 	/* Pass on STP flags */
 	bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0);
 	bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0);
 	bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0);
 	bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0);
 
 	/* Save the bits relating to the bridge */
 	bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK;
 
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_scache:
  *
  *	Store ifbrp_csize as the bridge's sc_brtmax and call
  *	bridge_rttrim() right afterwards.  Always returns 0.
  */
 static int
 bridge_ioctl_scache(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	sc->sc_brtmax = bparam->ifbrp_csize;
 	bridge_rttrim(sc);
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gcache:
  *
  *	Report the bridge's sc_brtmax in ifbrp_csize.  Always returns 0.
  */
 static int
 bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_csize = sc->sc_brtmax;
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gifs:
  *
  *	Copy out one struct ifbreq per member interface followed by one per
  *	span interface.
  *
  *	When the caller passes ifbic_len == 0 only the buffer size needed
  *	for all entries is reported back in ifbic_len.  Otherwise as many
  *	whole entries as fit in min(ifbic_len, needed size) are copied to
  *	ifbic_req and ifbic_len is set to the number of bytes written.
  *
  *	Returns ENOMEM if the staging buffer cannot be allocated, the error
  *	from bridge_ioctl_gifflags() if a member cannot be described, or
  *	the result of copyout().
  */
 static int
 bridge_ioctl_gifs(struct bridge_softc *sc, void *arg)
 {
 	struct ifbifconf *bifc = arg;
 	struct bridge_iflist *bif;
 	struct ifbreq breq;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	count = 0;
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next)
 		count++;
 	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		count++;
 
 	buflen = sizeof(breq) * count;
 	if (bifc->ifbic_len == 0) {
 		/* Size probe: tell the caller how much room is needed. */
 		bifc->ifbic_len = buflen;
 		return (0);
 	}
 	outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO);
 	if (outbuf == NULL)
 		return (ENOMEM);
 
 	count = 0;
 	buf = outbuf;
 	len = min(bifc->ifbic_len, buflen);
 	bzero(&breq, sizeof(breq));
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (len < sizeof(breq))
 			break;
 
 		strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
 		    sizeof(breq.ifbr_ifsname));
 		/* Fill in the ifbreq structure */
 		error = bridge_ioctl_gifflags(sc, &breq);
 		if (error) {
 			/*
 			 * Report the failure instead of letting the
 			 * copyout() result below overwrite it.
 			 */
 			goto out;
 		}
 		memcpy(buf, &breq, sizeof(breq));
 		count++;
 		buf += sizeof(breq);
 		len -= sizeof(breq);
 	}
 	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
 		if (len < sizeof(breq))
 			break;
 
 		/*
 		 * Start from a clean record: only name, flags and port
 		 * number are set for span ports, so without this the entry
 		 * would carry the STP state and address counters of the
 		 * member described last.
 		 */
 		bzero(&breq, sizeof(breq));
 		strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
 		    sizeof(breq.ifbr_ifsname));
 		breq.ifbr_ifsflags = bif->bif_flags;
 		breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff;
 		memcpy(buf, &breq, sizeof(breq));
 		count++;
 		buf += sizeof(breq);
 		len -= sizeof(breq);
 	}
 
 	bifc->ifbic_len = sizeof(breq) * count;
 	error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len);
 out:
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_rts(struct bridge_softc *sc, void *arg)
 {
 	struct ifbaconf *bac = arg;
 	struct bridge_rtnode *brt;
 	struct ifbareq bareq;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	if (bac->ifbac_len == 0)
 		return (0);
 
 	count = 0;
 	CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list)
 		count++;
 	buflen = sizeof(bareq) * count;
 
 	outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO);
 	if (outbuf == NULL)
 		return (ENOMEM);
 
 	count = 0;
 	buf = outbuf;
 	len = min(bac->ifbac_len, buflen);
 	bzero(&bareq, sizeof(bareq));
 	CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) {
 		if (len < sizeof(bareq))
 			goto out;
 		strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname,
 		    sizeof(bareq.ifba_ifsname));
 		memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr));
 		bareq.ifba_vlan = brt->brt_vlan;
 		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
 				time_uptime < brt->brt_expire)
 			bareq.ifba_expire = brt->brt_expire - time_uptime;
 		else
 			bareq.ifba_expire = 0;
 		bareq.ifba_flags = brt->brt_flags;
 
 		memcpy(buf, &bareq, sizeof(bareq));
 		count++;
 		buf += sizeof(bareq);
 		len -= sizeof(bareq);
 	}
 out:
 	bac->ifbac_len = sizeof(bareq) * count;
 	error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len);
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_saddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbareq *req = arg;
 	struct bridge_iflist *bif;
 	struct epoch_tracker et;
 	int error;
 
 	NET_EPOCH_ENTER(et);
 	bif = bridge_lookup_member(sc, req->ifba_ifsname);
 	if (bif == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (ENOENT);
 	}
 
 	/* bridge_rtupdate() may acquire the lock. */
 	error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1,
 	    req->ifba_flags);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * bridge_ioctl_sto:
  *
  *	Store ifbrp_ctime as the bridge's sc_brttimeout.  Always returns 0.
  */
 static int
 bridge_ioctl_sto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	sc->sc_brttimeout = bparam->ifbrp_ctime;
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gto:
  *
  *	Report the bridge's sc_brttimeout in ifbrp_ctime.  Always returns 0.
  */
 static int
 bridge_ioctl_gto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_ctime = sc->sc_brttimeout;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_daddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbareq *req = arg;
 
 	return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan));
 }
 
 /*
  * bridge_ioctl_flush:
  *
  *	Run bridge_rtflush() with the flush type carried in ifbr_ifsflags,
  *	holding the routing table lock around the call.  Always returns 0.
  */
 static int
 bridge_ioctl_flush(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *breq;
 
 	breq = arg;
 
 	BRIDGE_RT_LOCK(sc);
 	bridge_rtflush(sc, breq->ifbr_ifsflags);
 	BRIDGE_RT_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gpri:
  *
  *	Report the spanning tree bridge priority in ifbrp_prio.
  *	Always returns 0.
  */
 static int
 bridge_ioctl_gpri(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_prio = sc->sc_stp.bs_bridge_priority;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_spri(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio));
 }
 
 /*
  * bridge_ioctl_ght:
  *
  *	Report the spanning tree hello time in ifbrp_hellotime.  The stored
  *	value is shifted right by 8 bits first.
  *	NOTE(review): presumably the STP code keeps timers in 1/256 second
  *	units -- confirm against the bridgestp implementation.
  *	Always returns 0.
  */
 static int
 bridge_ioctl_ght(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_hellotime = sc->sc_stp.bs_bridge_htime >> 8;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_sht(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime));
 }
 
 /*
  * bridge_ioctl_gfd:
  *
  *	Report the spanning tree forward delay in ifbrp_fwddelay.  The
  *	stored value is shifted right by 8 bits first.
  *	NOTE(review): presumably the STP code keeps timers in 1/256 second
  *	units -- confirm against the bridgestp implementation.
  *	Always returns 0.
  */
 static int
 bridge_ioctl_gfd(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_fwddelay = sc->sc_stp.bs_bridge_fdelay >> 8;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_sfd(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay));
 }
 
 /*
  * bridge_ioctl_gma:
  *
  *	Report the spanning tree maximum age in ifbrp_maxage.  The stored
  *	value is shifted right by 8 bits first.
  *	NOTE(review): presumably the STP code keeps timers in 1/256 second
  *	units -- confirm against the bridgestp implementation.
  *	Always returns 0.
  */
 static int
 bridge_ioctl_gma(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_maxage = sc->sc_stp.bs_bridge_max_age >> 8;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_sma(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage));
 }
 
 /*
  * bridge_ioctl_sifprio:
  *
  *	Set the spanning tree port priority of the named member through
  *	bstp_set_port_priority().  Returns ENOENT for an unknown member,
  *	otherwise the result of that call.
  */
 static int
 bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *breq;
 	struct bridge_iflist *member;
 
 	breq = arg;
 	member = bridge_lookup_member(sc, breq->ifbr_ifsname);
 	if (member != NULL)
 		return (bstp_set_port_priority(&member->bif_stp,
 		    breq->ifbr_priority));
 
 	return (ENOENT);
 }
 
 /*
  * bridge_ioctl_sifcost:
  *
  *	Set the spanning tree path cost of the named member through
  *	bstp_set_path_cost().  Returns ENOENT for an unknown member,
  *	otherwise the result of that call.
  */
 static int
 bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *breq;
 	struct bridge_iflist *member;
 
 	breq = arg;
 	member = bridge_lookup_member(sc, breq->ifbr_ifsname);
 	if (member != NULL)
 		return (bstp_set_path_cost(&member->bif_stp,
 		    breq->ifbr_path_cost));
 
 	return (ENOENT);
 }
 
 /*
  * bridge_ioctl_sifmaxaddr:
  *
  *	Store ifbr_addrmax as the bif_addrmax of the named member.
  *	Returns ENOENT for an unknown member, 0 otherwise.
  */
 static int
 bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *breq;
 	struct bridge_iflist *member;
 
 	breq = arg;
 	member = bridge_lookup_member(sc, breq->ifbr_ifsname);
 	if (member != NULL) {
 		member->bif_addrmax = breq->ifbr_addrmax;
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 static int
 bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif = NULL;
 	struct ifnet *ifs;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 
 	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			return (EBUSY);
 
 	if (ifs->if_bridge != NULL)
 		return (EBUSY);
 
 	switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_GIF:
 		case IFT_L2VLAN:
 			break;
 		default:
 			return (EINVAL);
 	}
 
 	bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (bif == NULL)
 		return (ENOMEM);
 
 	bif->bif_ifp = ifs;
 	bif->bif_flags = IFBIF_SPAN;
 
 	CK_LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next);
 
 	return (0);
 }
 
 static int
 bridge_ioctl_delspan(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct ifnet *ifs;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 
 	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			break;
 
 	if (bif == NULL)
 		return (ENOENT);
 
 	bridge_delete_span(sc, bif);
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gbparam:
  *
  *	Fill in an ifbropreq with the bridge's spanning tree parameters:
  *	timers (each shifted right by 8 bits), root port index (0 when
  *	there is no root port), hold count, priority, protocol, root path
  *	cost, the bridge/root identifiers and the time of the last topology
  *	change.  Always returns 0.
  */
 static int
 bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg)
 {
 	struct ifbropreq *req = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 	struct bstp_port *rp = bs->bs_root_port;
 
 	/* Timers. */
 	req->ifbop_maxage = bs->bs_bridge_max_age >> 8;
 	req->ifbop_hellotime = bs->bs_bridge_htime >> 8;
 	req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8;
 
 	req->ifbop_root_port = (rp != NULL) ? rp->bp_ifp->if_index : 0;
 
 	/* Bridge-wide settings. */
 	req->ifbop_holdcount = bs->bs_txholdcount;
 	req->ifbop_priority = bs->bs_bridge_priority;
 	req->ifbop_protocol = bs->bs_protover;
 
 	/* Priority vector information. */
 	req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost;
 	req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id;
 	req->ifbop_designated_root = bs->bs_root_pv.pv_root_id;
 	req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id;
 
 	/* Time of the last topology change. */
 	req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec;
 	req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec;
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_grte:
  *
  *	Report the bridge's sc_brtexceeded counter in ifbrp_cexceeded.
  *	Always returns 0.
  */
 static int
 bridge_ioctl_grte(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *bparam;
 
 	bparam = arg;
 	bparam->ifbrp_cexceeded = sc->sc_brtexceeded;
 
 	return (0);
 }
 
 /*
  * bridge_ioctl_gifsstp:
  *
  *	Copy out one struct ifbpstpreq for every member that has IFBIF_STP
  *	set.  When the caller passes ifbpstp_len == 0 only the buffer size
  *	needed is reported back.  Otherwise as many whole entries as fit in
  *	min(ifbpstp_len, needed size) are written to ifbpstp_req and
  *	ifbpstp_len is set to the number of bytes produced.  Returns ENOMEM
  *	if the staging buffer cannot be allocated, otherwise the result of
  *	copyout().
  */
 static int
 bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg)
 {
 	struct ifbpstpconf *bifstp = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 	struct ifbpstpreq bpreq;
 	char *cursor, *outbuf;
 	int nports, buflen, room, error;
 
 	nports = 0;
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (bif->bif_flags & IFBIF_STP)
 			nports++;
 	}
 
 	buflen = sizeof(bpreq) * nports;
 	if (bifstp->ifbpstp_len == 0) {
 		/* Size probe: tell the caller how much room is needed. */
 		bifstp->ifbpstp_len = buflen;
 		return (0);
 	}
 
 	outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO);
 	if (outbuf == NULL)
 		return (ENOMEM);
 
 	nports = 0;
 	cursor = outbuf;
 	room = min(bifstp->ifbpstp_len, buflen);
 	bzero(&bpreq, sizeof(bpreq));
 	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		/* Stop as soon as another whole entry no longer fits. */
 		if (room < sizeof(bpreq))
 			break;
 
 		/* Members without spanning tree are not reported. */
 		if (!(bif->bif_flags & IFBIF_STP))
 			continue;
 
 		bp = &bif->bif_stp;
 		bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff;
 		bpreq.ifbp_fwd_trans = bp->bp_forward_transitions;
 		bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost;
 		bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id;
 		bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id;
 		bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id;
 
 		memcpy(cursor, &bpreq, sizeof(bpreq));
 		nports++;
 		cursor += sizeof(bpreq);
 		room -= sizeof(bpreq);
 	}
 
 	bifstp->ifbpstp_len = sizeof(bpreq) * nports;
 	error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len);
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_sproto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto));
 }
 
 static int
 bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc));
 }
 
 /*
  * bridge_ifdetach:
  *
  *	Detach an interface from a bridge.  Called when a member
  *	interface is detaching.
  *
  *	If the interface is a bridge member it is removed from that bridge
  *	with `gone' set.  Otherwise every bridge on V_bridge_list is
  *	searched and the interface is dropped from any span list it is on.
  */
 static void
 bridge_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *bif;
 
 	/*
 	 * Nothing to do for an interface that is merely being renamed.
 	 * This detach handler can also be called after
 	 * vnet_bridge_uninit(); just return in that case as well.
 	 */
 	if ((ifp->if_flags & IFF_RENAMING) != 0 || V_bridge_cloner == NULL)
 		return;
 
 	/* Check if the interface is a bridge member */
 	if (sc != NULL) {
 		BRIDGE_LOCK(sc);
 		bif = bridge_lookup_member_if(sc, ifp);
 		if (bif != NULL)
 			bridge_delete_member(sc, bif, 1);
 		BRIDGE_UNLOCK(sc);
 		return;
 	}
 
 	/* Check if the interface is a span port */
 	BRIDGE_LIST_LOCK();
 	LIST_FOREACH(sc, &V_bridge_list, sc_list) {
 		BRIDGE_LOCK(sc);
 		CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
 			if (bif->bif_ifp != ifp)
 				continue;
 			/* The entry is unlinked here, so stop walking. */
 			bridge_delete_span(sc, bif);
 			break;
 		}
 		BRIDGE_UNLOCK(sc);
 	}
 	BRIDGE_LIST_UNLOCK();
 }
 
 /*
  * bridge_init:
  *
  *	Initialize a bridge interface.  Does nothing when the interface is
  *	already marked running; otherwise arms the bridge_timer callout,
  *	sets IFF_DRV_RUNNING and initializes spanning tree, all under the
  *	bridge lock.
  */
 static void
 bridge_init(void *xsc)
 {
 	struct bridge_softc *sc = xsc;
 	struct ifnet *ifp = sc->sc_ifp;
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0)
 		return;
 
 	BRIDGE_LOCK(sc);
 
 	callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz,
 	    bridge_timer, sc);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* Initialize Spanning Tree */
 	bstp_init(&sc->sc_stp);
 
 	BRIDGE_UNLOCK(sc);
 }
 
 /*
  * bridge_stop:
  *
  *	Stop the bridge interface.  Does nothing unless the interface is
  *	marked running; otherwise stops the callout and spanning tree,
  *	flushes the dynamic address entries and clears IFF_DRV_RUNNING.
  *	The bridge lock must be held.  The `disable' argument is not used.
  */
 static void
 bridge_stop(struct ifnet *ifp, int disable)
 {
 	struct bridge_softc *sc;
 
 	sc = ifp->if_softc;
 	BRIDGE_LOCK_ASSERT(sc);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		BRIDGE_RT_LOCK(sc);
 		callout_stop(&sc->sc_brcallout);
 		bstp_stop(&sc->sc_stp);
 		bridge_rtflush(sc, IFBF_FLUSHDYN);
 		BRIDGE_RT_UNLOCK(sc);
 
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	}
 }
 
 /*
  * bridge_enqueue:
  *
  *	Enqueue a packet on a bridge member interface.
  *
  *	`m' may head a chain linked through m_nextpkt; each packet is
  *	handed to dst_ifp's if_transmit in turn.  Output packet, byte and
  *	multicast counters are accounted on the bridge interface.  On the
  *	first if_transmit failure the packets still pending in the chain
  *	are freed, the bridge's output error counter is bumped and the
  *	error is returned; otherwise 0 is returned.
  */
 static int
 bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
 {
 	int len, err = 0;
 	short mflags;
 	struct mbuf *m0;
 
 	/* We may be sending a fragment so traverse the mbuf */
 	for (; m; m = m0) {
 		/* Unlink the packet and sample what is needed after transmit. */
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		len = m->m_pkthdr.len;
 		mflags = m->m_flags;
 
 		/*
 		 * If underlying interface can not do VLAN tag insertion itself
 		 * then attach a packet tag that holds it.
 		 */
 		if ((m->m_flags & M_VLANTAG) &&
 		    (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
 			m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
 			if (m == NULL) {
 				if_printf(dst_ifp,
 				    "unable to prepend VLAN header\n");
 				if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1);
 				/* Carry on with the next packet (m = m0). */
 				continue;
 			}
 			m->m_flags &= ~M_VLANTAG;
 		}
 
 		M_ASSERTPKTHDR(m); /* We shouldn't transmit mbuf without pkthdr */
 		if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
 			int n;
 
 			/*
 			 * n starts at 1 for the packet that just failed and
 			 * grows by one for each remaining packet freed here.
 			 */
 			for (m = m0, n = 1; m != NULL; m = m0, n++) {
 				m0 = m->m_nextpkt;
 				m_freem(m);
 			}
 			if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, n);
 			break;
 		}
 
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len);
 		if (mflags & M_MCAST)
 			if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1);
 	}
 
 	return (err);
 }
 
 /*
  * bridge_dummynet:
  *
  * 	Receive a queued packet from dummynet and pass it on to the output
  * 	interface.
  *
  *	The mbuf has the Ethernet header already attached.  The packet is
  *	freed when the interface no longer belongs to a bridge, and is run
  *	through bridge_pfil() in the outbound direction when an IPv4 (or,
  *	with INET6, IPv6) pfil hook is present.
  */
 static void
 bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 
 	/*
 	 * The packet didn't originate from a member interface. This should
 	 * only ever happen if a member interface is removed while packets
 	 * are queued for it.
 	 */
 	if (sc == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)
 #ifdef INET6
 	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
 #endif
 	    ) {
 		/* Stop on a filter error or when no packet is left. */
 		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0 ||
 		    m == NULL)
 			return;
 	}
 
 	bridge_enqueue(sc, ifp, m);
 }
 
 /*
  * bridge_output:
  *
  *	Send output from a bridge member interface.  This
  *	performs the bridging function for locally originated
  *	packets.
  *
  *	The mbuf has the Ethernet header already attached.  We must
  *	enqueue or free the mbuf before returning.
  *
  *	Must be called inside a net epoch section (asserted).  Always
  *	returns 0, including when the packet is dropped.  The `sa' and
  *	`rt' arguments are not used.
  */
 static int
 bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
     struct rtentry *rt)
 {
 	struct ether_header *eh;
 	struct ifnet *bifp, *dst_if;
 	struct bridge_softc *sc;
 	uint16_t vlan;
 
 	NET_EPOCH_ASSERT();
 
 	/* Make sure the Ethernet header is contiguous before reading it. */
 	if (m->m_len < ETHER_HDR_LEN) {
 		m = m_pullup(m, ETHER_HDR_LEN);
 		if (m == NULL)
 			return (0);
 	}
 
 	eh = mtod(m, struct ether_header *);
 	sc = ifp->if_bridge;
 	vlan = VLANTAGOF(m);
 
 	bifp = sc->sc_ifp;
 
 	/*
 	 * If bridge is down, but the original output interface is up,
 	 * go ahead and send out that interface.  Otherwise, the packet
 	 * is dropped below.
 	 */
 	if ((bifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		dst_if = ifp;
 		goto sendunicast;
 	}
 
 	/*
 	 * If the packet is a multicast, or we don't know a better way to
 	 * get there, send to all interfaces.
 	 */
 	if (ETHER_IS_MULTICAST(eh->ether_dhost))
 		dst_if = NULL;
 	else
 		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
 	/* Tap any traffic not passing back out the originating interface */
 	if (dst_if != ifp)
 		ETHER_BPF_MTAP(bifp, m);
 	if (dst_if == NULL) {
 		struct bridge_iflist *bif;
 		struct mbuf *mc;
 		int used = 0;	/* set once the original mbuf has been sent */
 
 		bridge_span(sc, m);
 
 		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 			dst_if = bif->bif_ifp;
 
 			if (dst_if->if_type == IFT_GIF)
 				continue;
 			if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 				continue;
 
 			/*
 			 * If this is not the original output interface,
 			 * and the interface is participating in spanning
 			 * tree, make sure the port is in a state that
 			 * allows forwarding.
 			 */
 			if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
 			    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 				continue;
 
 			/*
 			 * The last member on the list gets the original
 			 * mbuf; every other member gets a copy.
 			 */
 			if (CK_LIST_NEXT(bif, bif_next) == NULL) {
 				used = 1;
 				mc = m;
 			} else {
 				mc = m_dup(m, M_NOWAIT);
 				if (mc == NULL) {
 					if_inc_counter(bifp, IFCOUNTER_OERRORS, 1);
 					continue;
 				}
 			}
 
 			bridge_enqueue(sc, dst_if, mc);
 		}
 		/* Free the original if it was never handed to a member. */
 		if (used == 0)
 			m_freem(m);
 		return (0);
 	}
 
 sendunicast:
 	/*
 	 * XXX Spanning tree consideration here?
 	 */
 
 	bridge_span(sc, m);
 	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		return (0);
 	}
 
 	bridge_enqueue(sc, dst_if, m);
 	return (0);
 }
 
 /*
  * bridge_transmit:
  *
  *	Do output on a bridge.  The frame is tapped to bpf, then sent to
  *	the member found by bridge_rtlookup() when it is neither broadcast
  *	nor multicast and the lookup succeeds; in every other case it is
  *	passed to bridge_broadcast().  Returns the bridge_enqueue() result
  *	for the unicast case and 0 otherwise.
  */
 static int
 bridge_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 	struct ether_header *eh;
 	struct ifnet *dst_if;
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	eh = mtod(m, struct ether_header *);
 
 	/* Broadcast and multicast frames are always flooded. */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) != 0) {
 		bridge_broadcast(sc, ifp, m, 0);
 		return (0);
 	}
 
 	/* Unknown unicast destinations are flooded as well. */
 	dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1);
 	if (dst_if == NULL) {
 		bridge_broadcast(sc, ifp, m, 0);
 		return (0);
 	}
 
 	return (bridge_enqueue(sc, dst_if, m));
 }
 
 #ifdef ALTQ
 /*
  * bridge_altq_start:
  *
  *	Drain the interface send queue, passing every dequeued packet to
  *	bridge_transmit().  The queue lock is held for the whole drain.
  */
 static void
 bridge_altq_start(if_t ifp)
 {
 	struct ifaltq *ifq = &ifp->if_snd;
 	struct mbuf *m;
 
 	IFQ_LOCK(ifq);
 	for (;;) {
 		IFQ_DEQUEUE_NOLOCK(ifq, m);
 		if (m == NULL)
 			break;
 		bridge_transmit(ifp, m);
 	}
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * bridge_altq_transmit:
  *
  *	Transmit entry point used with ALTQ.  When ALTQ is enabled on the
  *	send queue the packet is enqueued there and, if that succeeded, the
  *	queue is drained; otherwise the packet goes straight to
  *	bridge_transmit().  Returns the enqueue or transmit result.
  */
 static int
 bridge_altq_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (bridge_transmit(ifp, m));
 
 	IFQ_ENQUEUE(&ifp->if_snd, m, err);
 	if (err == 0)
 		bridge_altq_start(ifp);
 
 	return (err);
 }
 #endif	/* ALTQ */
 
 /*
  * The ifp->if_qflush entry point for if_bridge(4) is no-op.
  *
  * The body is intentionally empty and the interface argument is unused.
  */
 static void
 bridge_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * bridge_forward:
  *
  *	The forwarding function of the bridge.
  *
  *	`sbif' is the member the frame was received on.  The frame is
  *	counted as input on the bridge, its source address is learned when
  *	the member has IFBIF_LEARNING set, and it is then either dropped,
  *	passed to bridge_broadcast() or enqueued on the single destination
  *	member.  Must be called inside a net epoch section (asserted).
  *
  *	NOTE(review): an earlier version of this comment said "Releases the
  *	lock on return", but no lock is taken or dropped anywhere in this
  *	function -- that note looks stale; confirm against the callers.
  */
 static void
 bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
     struct mbuf *m)
 {
 	struct bridge_iflist *dbif;
 	struct ifnet *src_if, *dst_if, *ifp;
 	struct ether_header *eh;
 	uint16_t vlan;
 	uint8_t *dst;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	src_if = m->m_pkthdr.rcvif;
 	ifp = sc->sc_ifp;
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	vlan = VLANTAGOF(m);
 
 	/* A discarding STP port neither learns nor forwards. */
 	if ((sbif->bif_flags & IFBIF_STP) &&
 	    sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 		goto drop;
 
 	eh = mtod(m, struct ether_header *);
 	dst = eh->ether_dhost;
 
 	/* If the interface is learning, record the address. */
 	if (sbif->bif_flags & IFBIF_LEARNING) {
 		error = bridge_rtupdate(sc, eh->ether_shost, vlan,
 		    sbif, 0, IFBAF_DYNAMIC);
 		/*
 		 * If the interface has addresses limits then deny any source
 		 * that is not in the cache.
 		 */
 		if (error && sbif->bif_addrmax)
 			goto drop;
 	}
 
 	/* A learning STP port has recorded the address but must not forward. */
 	if ((sbif->bif_flags & IFBIF_STP) != 0 &&
 	    sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
 		goto drop;
 
 	/*
 	 * At this point, the port either doesn't participate
 	 * in spanning tree or it is in the forwarding state.
 	 */
 
 	/*
 	 * If the packet is unicast, destined for someone on
 	 * "this" side of the bridge, drop it.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
 		dst_if = bridge_rtlookup(sc, dst, vlan);
 		if (src_if == dst_if)
 			goto drop;
 	} else {
 		/*
 		 * Check if it's a reserved multicast address, any address
 		 * listed in 802.1D section 7.12.6 may not be forwarded by the
 		 * bridge.
 		 * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
 		 */
 		if (dst[0] == 0x01 && dst[1] == 0x80 &&
 		    dst[2] == 0xc2 && dst[3] == 0x00 &&
 		    dst[4] == 0x00 && dst[5] <= 0x0f)
 			goto drop;
 
 		/* ...forward it to all interfaces. */
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 		dst_if = NULL;
 	}
 
 	/*
 	 * If we have a destination interface which is a member of our bridge,
 	 * OR this is a unicast packet, push it through the bpf(4) machinery.
 	 * For broadcast or multicast packets, don't bother because it will
 	 * be reinjected into ether_input. We do this before we pass the packets
 	 * through the pfil(9) framework, as it is possible that pfil(9) will
 	 * drop the packet, or possibly modify it, making it difficult to debug
 	 * firewall issues on the bridge.
 	 */
 	if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0)
 		ETHER_BPF_MTAP(ifp, m);
 
 	/* run the packet filter */
 	if (PFIL_HOOKED_IN(V_inet_pfil_head)
 #ifdef INET6
 	    || PFIL_HOOKED_IN(V_inet6_pfil_head)
 #endif
 	    ) {
 		if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
 			return;
 		if (m == NULL)
 			return;
 	}
 
 	/* No single destination: flood via bridge_broadcast(). */
 	if (dst_if == NULL) {
 		bridge_broadcast(sc, src_if, m, 1);
 		return;
 	}
 
 	/*
 	 * At this point, we're dealing with a unicast frame
 	 * going to a different interface.
 	 */
 	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		goto drop;
 
 	dbif = bridge_lookup_member_if(sc, dst_if);
 	if (dbif == NULL)
 		/* Not a member of the bridge (anymore?) */
 		goto drop;
 
 	/* Private segments can not talk to each other */
 	if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
 		goto drop;
 
 	/* Do not send out of a discarding STP port either. */
 	if ((dbif->bif_flags & IFBIF_STP) &&
 	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 		goto drop;
 
 	/* Outbound filter pass on the destination interface. */
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)
 #ifdef INET6
 	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
 #endif
 	    ) {
 		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
 			return;
 		if (m == NULL)
 			return;
 	}
 
 	bridge_enqueue(sc, dst_if, m);
 	return;
 
 drop:
 	m_freem(m);
 }
 
 /*
  * bridge_input:
  *
  *	Receive input from a member interface.  Queue the packet for
  *	bridging if it is not for us.
  */
 static struct mbuf *
 bridge_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *bif, *bif2;
 	struct ifnet *bifp;
 	struct ether_header *eh;
 	struct mbuf *mc, *mc2;
 	uint16_t vlan;
 	int error;
 
 	/*
 	 * ifp is the member interface the frame arrived on.  The return
 	 * value is the mbuf the caller should keep processing, or NULL when
 	 * the frame has been freed or handed to the bridge.
 	 */
 	NET_EPOCH_ASSERT();
 
 	/* A bridge that is not running leaves the frame to the caller. */
 	if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return (m);
 
 	bifp = sc->sc_ifp;
 	vlan = VLANTAGOF(m);
 
 	/*
 	 * Implement support for bridge monitoring. If this flag has been
 	 * set on this interface, discard the packet once we push it through
 	 * the bpf(4) machinery, but before we do, increment the byte and
 	 * packet counters associated with this interface.
 	 */
 	if ((bifp->if_flags & IFF_MONITOR) != 0) {
 		m->m_pkthdr.rcvif  = bifp;
 		ETHER_BPF_MTAP(bifp, m);
 		if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1);
 		if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 		m_freem(m);
 		return (NULL);
 	}
 	/* ifp is not (or no longer) a member: not our frame to bridge. */
 	bif = bridge_lookup_member_if(sc, ifp);
 	if (bif == NULL) {
 		return (m);
 	}
 
 	eh = mtod(m, struct ether_header *);
 
 	/* Copy the frame to any span ports before deciding its fate. */
 	bridge_span(sc, m);
 
 	if (m->m_flags & (M_BCAST|M_MCAST)) {
 		/* Tap off 802.1D packets; they do not get forwarded. */
 		if (memcmp(eh->ether_dhost, bstp_etheraddr,
 		    ETHER_ADDR_LEN) == 0) {
 			bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */
 			return (NULL);
 		}
 
 		/* A discarding port does not forward; keep local delivery. */
 		if ((bif->bif_flags & IFBIF_STP) &&
 		    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
 			return (m);
 		}
 
 		/*
 		 * Make a deep copy of the packet and enqueue the copy
 		 * for bridge processing; return the original packet for
 		 * local processing.
 		 */
 		mc = m_dup(m, M_NOWAIT);
 		if (mc == NULL) {
 			return (m);
 		}
 
 		/* Perform the bridge forwarding function with the copy. */
 		bridge_forward(sc, bif, mc);
 
 		/*
 		 * Reinject the mbuf as arriving on the bridge so we have a
 		 * chance at claiming multicast packets. We can not loop back
 		 * here from ether_input as a bridge is never a member of a
 		 * bridge.
 		 */
 		KASSERT(bifp->if_bridge == NULL,
 		    ("loop created in bridge_input"));
 		mc2 = m_dup(m, M_NOWAIT);
 		if (mc2 != NULL) {
 			/* Keep the layer3 header aligned */
 			int i = min(mc2->m_pkthdr.len, max_protohdr);
 			mc2 = m_copyup(mc2, i, ETHER_ALIGN);
 		}
 		/* mc2 is NULL here if either the dup or the copyup failed. */
 		if (mc2 != NULL) {
 			mc2->m_pkthdr.rcvif = bifp;
 			(*bifp->if_input)(bifp, mc2);
 		}
 
 		/* Return the original packet for local processing. */
 		return (m);
 	}
 
 	/* Unicast on a discarding port: no bridging, leave it to the caller. */
 	if ((bif->bif_flags & IFBIF_STP) &&
 	    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
 		return (m);
 	}
 
 	/*
 	 * Helper macros for the unicast "is this frame ours?" checks below.
 	 * The OR_* macros expand to an optional "|| ..." clause (or to nothing
 	 * when the corresponding option is compiled out).
 	 */
 #if (defined(INET) || defined(INET6))
 #   define OR_CARP_CHECK_WE_ARE_DST(iface) \
 	|| ((iface)->if_carp \
 	    && (*carp_forus_p)((iface), eh->ether_dhost))
 #   define OR_CARP_CHECK_WE_ARE_SRC(iface) \
 	|| ((iface)->if_carp \
 	    && (*carp_forus_p)((iface), eh->ether_shost))
 #else
 #   define OR_CARP_CHECK_WE_ARE_DST(iface)
 #   define OR_CARP_CHECK_WE_ARE_SRC(iface)
 #endif
 
 #ifdef INET6
 #   define OR_PFIL_HOOKED_INET6 \
 	|| PFIL_HOOKED_IN(V_inet6_pfil_head)
 #else
 #   define OR_PFIL_HOOKED_INET6
 #endif
 
 	/*
 	 * GRAB_OUR_PACKETS(iface) returns from bridge_input() when the frame
 	 * is addressed to iface (returning m, or NULL if it was dropped) or
 	 * was sent by iface (freed, returning NULL).  It uses "continue" to
 	 * skip IFT_GIF interfaces, so it must be expanded inside a loop; the
 	 * single-shot uses below wrap it in do { } while (0) for that reason.
 	 */
 #define GRAB_OUR_PACKETS(iface) \
 	if ((iface)->if_type == IFT_GIF) \
 		continue; \
 	/* It is destined for us. */ \
 	if (memcmp(IF_LLADDR((iface)), eh->ether_dhost,  ETHER_ADDR_LEN) == 0 \
 	    OR_CARP_CHECK_WE_ARE_DST((iface))				\
 	    ) {								\
 		if (bif->bif_flags & IFBIF_LEARNING) {			\
 			error = bridge_rtupdate(sc, eh->ether_shost,	\
 			    vlan, bif, 0, IFBAF_DYNAMIC);		\
 			if (error && bif->bif_addrmax) {		\
 				m_freem(m);				\
 				return (NULL);				\
 			}						\
 		}							\
 		m->m_pkthdr.rcvif = iface;				\
 		if ((iface) == ifp) {					\
 			/* Skip bridge processing... src == dest */	\
 			return (m);					\
 		}							\
 		/* It's passing over or to the bridge, locally. */	\
 		ETHER_BPF_MTAP(bifp, m);				\
 		if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1);		\
 		if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); \
 		/* Filter on the physical interface. */			\
 		if (V_pfil_local_phys && (PFIL_HOOKED_IN(V_inet_pfil_head) \
 		     OR_PFIL_HOOKED_INET6)) {				\
 			if (bridge_pfil(&m, NULL, ifp,			\
 			    PFIL_IN) != 0 || m == NULL) {		\
 				return (NULL);				\
 			}						\
 		}							\
 		if ((iface) != bifp)					\
 			ETHER_BPF_MTAP(iface, m);			\
 		return (m);						\
 	}								\
 									\
 	/* We just received a packet that we sent out. */		\
 	if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \
 	    OR_CARP_CHECK_WE_ARE_SRC((iface))			\
 	    ) {								\
 		m_freem(m);						\
 		return (NULL);						\
 	}
 
 	/*
 	 * Unicast.  Make sure it's not for the bridge.
 	 */
 	do { GRAB_OUR_PACKETS(bifp) } while (0);
 
 	/*
 	 * Give a chance for ifp at first priority. This will help when	the
 	 * packet comes through the interface like VLAN's with the same MACs
 	 * on several interfaces from the same bridge. This also will save
 	 * some CPU cycles in case the destination interface and the input
 	 * interface (eq ifp) are the same.
 	 */
 	do { GRAB_OUR_PACKETS(ifp) } while (0);
 
 	/* Now check the all bridge members. */
 	CK_LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) {
 		GRAB_OUR_PACKETS(bif2->bif_ifp)
 	}
 
 #undef OR_CARP_CHECK_WE_ARE_DST
 #undef OR_CARP_CHECK_WE_ARE_SRC
 #undef OR_PFIL_HOOKED_INET6
 #undef GRAB_OUR_PACKETS
 
 	/* Perform the bridge forwarding function. */
 	bridge_forward(sc, bif, m);
 
 	/* bridge_forward() took ownership of m. */
 	return (NULL);
 }
 
 /*
  * bridge_broadcast:
  *
  *	Send a frame to all interfaces that are members of
  *	the bridge, except for the one on which the packet
  *	arrived.
  *
  *	NOTE: Must be called within the network epoch.  The mbuf is
  *	always consumed before returning.
  */
 static void
 bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
     struct mbuf *m, int runfilt)
 {
 	struct bridge_iflist *dbif, *sbif;
 	struct mbuf *mc;
 	struct ifnet *dst_if;
 	/* used becomes 1 once the original mbuf m itself has been sent. */
 	int used = 0, i;
 
 	NET_EPOCH_ASSERT();
 
 	/* sbif may be NULL if src_if is not a member of this bridge. */
 	sbif = bridge_lookup_member_if(sc, src_if);
 
 	/* Filter on the bridge interface before broadcasting */
 	if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head)
 #ifdef INET6
 	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
 #endif
 	    )) {
 		if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0)
 			return;
 		if (m == NULL)
 			return;
 	}
 
 	CK_LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) {
 		dst_if = dbif->bif_ifp;
 		/* Never send the frame back out of the port it came in on. */
 		if (dst_if == src_if)
 			continue;
 
 		/* Private segments can not talk to each other */
 		if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
 			continue;
 
 		if ((dbif->bif_flags & IFBIF_STP) &&
 		    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 			continue;
 
 		/* Unknown unicast is only flooded to ports with DISCOVER set. */
 		if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 &&
 		    (m->m_flags & (M_BCAST|M_MCAST)) == 0)
 			continue;
 
 		if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			continue;
 
 		/*
 		 * The last list entry can take the original mbuf; every
 		 * other recipient gets its own deep copy.
 		 */
 		if (CK_LIST_NEXT(dbif, bif_next) == NULL) {
 			mc = m;
 			used = 1;
 		} else {
 			mc = m_dup(m, M_NOWAIT);
 			if (mc == NULL) {
 				if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 				continue;
 			}
 		}
 
 		/*
 		 * Filter on the output interface. Pass a NULL bridge interface
 		 * pointer so we do not redundantly filter on the bridge for
 		 * each interface we broadcast on.
 		 */
 		if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head)
 #ifdef INET6
 		    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
 #endif
 		    )) {
 			/* Only copies are realigned; the original is left as is. */
 			if (used == 0) {
 				/* Keep the layer3 header aligned */
 				i = min(mc->m_pkthdr.len, max_protohdr);
 				mc = m_copyup(mc, i, ETHER_ALIGN);
 				if (mc == NULL) {
 					if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 					continue;
 				}
 			}
 			if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0)
 				continue;
 			if (mc == NULL)
 				continue;
 		}
 
 		bridge_enqueue(sc, dst_if, mc);
 	}
 	/* Nobody took the original (e.g. the last member was skipped). */
 	if (used == 0)
 		m_freem(m);
 }
 
 /*
  * bridge_span:
  *
  *	Duplicate a packet out one or more interfaces that are in span mode,
  *	the original mbuf is unmodified.
  */
 static void
 bridge_span(struct bridge_softc *sc, struct mbuf *m)
 {
 	struct bridge_iflist *span;
 	struct ifnet *out_if;
 	struct mbuf *copy;
 
 	NET_EPOCH_ASSERT();
 
 	/* Nothing to mirror to unless at least one span port is configured. */
 	if (!CK_LIST_EMPTY(&sc->sc_spanlist)) {
 		CK_LIST_FOREACH(span, &sc->sc_spanlist, bif_next) {
 			out_if = span->bif_ifp;
 
 			/* Only running span ports receive a copy. */
 			if ((out_if->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 				copy = m_dup(m, M_NOWAIT);
 				if (copy != NULL) {
 					bridge_enqueue(sc, out_if, copy);
 				} else {
 					/* Count the copy we could not make. */
 					if_inc_counter(sc->sc_ifp,
 					    IFCOUNTER_OERRORS, 1);
 				}
 			}
 		}
 	}
 }
 
 /*
  * bridge_rtupdate:
  *
  *	Add a bridge routing entry.
  */
 /*
  * Learn or refresh the forwarding entry for address 'dst' on 'vlan' and
  * point it at member 'bif'.
  *
  *	setflags	when non-zero, brt_flags of the entry is replaced
  *			by 'flags'
  *	flags		if its type bits are IFBAF_DYNAMIC the expiry time
  *			of the entry is refreshed
  *
  * Returns 0 on success, EINVAL for a multicast or all-zero address,
  * ENOSPC when the bridge-wide or per-member address limit is reached,
  * ENOMEM when no node could be allocated, or the error reported by
  * bridge_rtnode_insert().
  */
 static int
 bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
     struct bridge_iflist *bif, int setflags, uint8_t flags)
 {
 	struct bridge_rtnode *brt;
 	int error;
 
 	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);
 
 	/* Check the source address is valid and not multicast. */
 	if (ETHER_IS_MULTICAST(dst) ||
 	    (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
 	     dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0)
 		return (EINVAL);
 
 	/* 802.1p frames map to vlan 1 */
 	if (vlan == 0)
 		vlan = 1;
 
 	/*
 	 * A route for this destination might already exist.  If so,
 	 * update it, otherwise create a new one.
 	 */
 	if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
 		BRIDGE_RT_LOCK(sc);
 
 		/* Check again, now that we have the lock. There could have
 		 * been a race and we only want to insert this once. */
 		if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) != NULL) {
 			BRIDGE_RT_UNLOCK(sc);
 			return (0);
 		}
 
 		if (sc->sc_brtcnt >= sc->sc_brtmax) {
 			sc->sc_brtexceeded++;
 			BRIDGE_RT_UNLOCK(sc);
 			return (ENOSPC);
 		}
 		/* Check per interface address limits (if enabled) */
 		if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
 			bif->bif_addrexceeded++;
 			BRIDGE_RT_UNLOCK(sc);
 			return (ENOSPC);
 		}
 
 		/*
 		 * Allocate a new bridge forwarding node, and
 		 * initialize the expiration time and Ethernet
 		 * address.
 		 */
 		brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO);
 		if (brt == NULL) {
 			BRIDGE_RT_UNLOCK(sc);
 			return (ENOMEM);
 		}
 		brt->brt_vnet = curvnet;
 
 		if (bif->bif_flags & IFBIF_STICKY)
 			brt->brt_flags = IFBAF_STICKY;
 		else
 			brt->brt_flags = IFBAF_DYNAMIC;
 
 		memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
 		brt->brt_vlan = vlan;
 
 		/*
 		 * Set the destination before the node becomes visible.
 		 * Lookups run under the net epoch without the RT lock, so a
 		 * node inserted with a NULL brt_dst could be dereferenced by
 		 * a concurrent reader.
 		 */
 		brt->brt_dst = bif;
 
 		if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
 			uma_zfree(V_bridge_rtnode_zone, brt);
 			BRIDGE_RT_UNLOCK(sc);
 			return (error);
 		}
 		bif->bif_addrcnt++;
 
 		BRIDGE_RT_UNLOCK(sc);
 	}
 
 	/* A dynamic entry follows the station when it moves to another port. */
 	if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
 	    brt->brt_dst != bif) {
 		BRIDGE_RT_LOCK(sc);
 		brt->brt_dst->bif_addrcnt--;
 		brt->brt_dst = bif;
 		brt->brt_dst->bif_addrcnt++;
 		BRIDGE_RT_UNLOCK(sc);
 	}
 
 	if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
 		brt->brt_expire = time_uptime + sc->sc_brttimeout;
 	if (setflags)
 		brt->brt_flags = flags;
 
 	return (0);
 }
 
 /*
  * bridge_rtlookup:
  *
  *	Lookup the destination interface for an address.
  */
 static struct ifnet *
 bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *node;
 
 	NET_EPOCH_ASSERT();
 
 	/* No table entry means no known destination interface. */
 	node = bridge_rtnode_lookup(sc, addr, vlan);
 	return (node != NULL ? node->brt_ifp : NULL);
 }
 
 /*
  * bridge_rttrim:
  *
  *	Trim the routing table so that we have a number
  *	of routing entries less than or equal to the
  *	maximum number.
  */
 static void
 bridge_rttrim(struct bridge_softc *sc)
 {
 	struct bridge_rtnode *victim, *following;
 
 	NET_EPOCH_ASSERT();
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	/* Already within the limit: nothing to trim. */
 	if (sc->sc_brtcnt <= sc->sc_brtmax)
 		return;
 
 	/* Expire stale entries first; that alone may be sufficient. */
 	bridge_rtage(sc);
 
 	/*
 	 * Otherwise discard dynamic entries in list order, stopping as soon
 	 * as the table fits again.
 	 */
 	CK_LIST_FOREACH_SAFE(victim, &sc->sc_rtlist, brt_list, following) {
 		if (sc->sc_brtcnt <= sc->sc_brtmax)
 			break;
 		if ((victim->brt_flags & IFBAF_TYPEMASK) != IFBAF_DYNAMIC)
 			continue;
 		bridge_rtnode_destroy(sc, victim);
 	}
 }
 
 /*
  * bridge_timer:
  *
  *	Aging timer for the bridge.
  */
 static void
 bridge_timer(void *arg)
 {
 	struct bridge_softc *bridge = arg;
 
 	BRIDGE_RT_LOCK_ASSERT(bridge);
 
 	/* Destruction of rtnodes requires a proper vnet context */
 	CURVNET_SET(bridge->sc_ifp->if_vnet);
 
 	/* Run one aging pass over the forwarding table. */
 	bridge_rtage(bridge);
 
 	/* Re-arm ourselves for as long as the bridge stays running. */
 	if ((bridge->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		callout_reset(&bridge->sc_brcallout,
 		    bridge_rtable_prune_period * hz, bridge_timer, bridge);
 	}
 
 	CURVNET_RESTORE();
 }
 
 /*
  * bridge_rtage:
  *
  *	Perform an aging cycle.
  */
 static void
 bridge_rtage(struct bridge_softc *sc)
 {
 	struct bridge_rtnode *node, *following;
 
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	CK_LIST_FOREACH_SAFE(node, &sc->sc_rtlist, brt_list, following) {
 		/* Only dynamic entries age out. */
 		if ((node->brt_flags & IFBAF_TYPEMASK) != IFBAF_DYNAMIC)
 			continue;
 		/* Destroy the entry once its expiry time has been reached. */
 		if (node->brt_expire <= time_uptime)
 			bridge_rtnode_destroy(sc, node);
 	}
 }
 
 /*
  * bridge_rtflush:
  *
  *	Remove all dynamic addresses from the bridge, or every address
  *	when 'full' is non-zero.
  */
 static void
 bridge_rtflush(struct bridge_softc *sc, int full)
 {
 	struct bridge_rtnode *node, *following;
 
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	CK_LIST_FOREACH_SAFE(node, &sc->sc_rtlist, brt_list, following) {
 		/* Without 'full', entries that are not dynamic are kept. */
 		if (!full &&
 		    (node->brt_flags & IFBAF_TYPEMASK) != IFBAF_DYNAMIC)
 			continue;
 		bridge_rtnode_destroy(sc, node);
 	}
 }
 
 /*
  * bridge_rtdaddr:
  *
  *	Remove an address from the table.
  */
 static int
 bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *node;
 	int removed = 0;
 
 	BRIDGE_RT_LOCK(sc);
 
 	/*
 	 * A vlan of zero matches the address on every vlan, so keep looking
 	 * up and destroying until no entry is left.
 	 */
 	for (;;) {
 		node = bridge_rtnode_lookup(sc, addr, vlan);
 		if (node == NULL)
 			break;
 		bridge_rtnode_destroy(sc, node);
 		removed = 1;
 	}
 
 	BRIDGE_RT_UNLOCK(sc);
 
 	/* Report ENOENT when nothing matched. */
 	if (removed)
 		return (0);
 	return (ENOENT);
 }
 
 /*
  * bridge_rtdelete:
  *
  *	Delete routes to a specific member interface.
  */
 static void
 bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
 {
 	struct bridge_rtnode *node, *following;
 
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	CK_LIST_FOREACH_SAFE(node, &sc->sc_rtlist, brt_list, following) {
 		/* Entries pointing at other interfaces are left alone. */
 		if (node->brt_ifp != ifp)
 			continue;
 		/* With 'full' every entry goes, otherwise only dynamic ones. */
 		if (full ||
 		    (node->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
 			bridge_rtnode_destroy(sc, node);
 	}
 }
 
 /*
  * bridge_rtable_init:
  *
  *	Initialize the route table for this bridge.
  */
 static void
 bridge_rtable_init(struct bridge_softc *sc)
 {
 	int bucket;
 
 	/* One list head per hash bucket. */
 	sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
 	    M_DEVBUF, M_WAITOK);
 
 	bucket = 0;
 	while (bucket < BRIDGE_RTHASH_SIZE) {
 		CK_LIST_INIT(&sc->sc_rthash[bucket]);
 		bucket++;
 	}
 
 	/* Flat list of all entries, plus a random per-bridge hash key. */
 	CK_LIST_INIT(&sc->sc_rtlist);
 	sc->sc_rthash_key = arc4random();
 }
 
 /*
  * bridge_rtable_fini:
  *
  *	Deconstruct the route table for this bridge.
  */
 static void
 bridge_rtable_fini(struct bridge_softc *sc)
 {
 	/* All entries must have been destroyed before the table goes away. */
 	KASSERT(sc->sc_brtcnt == 0,
 	    ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt));
 
 	/* Release the bucket array allocated by bridge_rtable_init(). */
 	free(sc->sc_rthash, M_DEVBUF);
 }
 
 /*
  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
  */
 #define	mix(a, b, c)							\
 do {									\
 	a -= b; a -= c; a ^= (c >> 13);					\
 	b -= c; b -= a; b ^= (a << 8);					\
 	c -= a; c -= b; c ^= (b >> 13);					\
 	a -= b; a -= c; a ^= (c >> 12);					\
 	b -= c; b -= a; b ^= (a << 16);					\
 	c -= a; c -= b; c ^= (b >> 5);					\
 	a -= b; a -= c; a ^= (c >> 3);					\
 	b -= c; b -= a; b ^= (a << 10);					\
 	c -= a; c -= b; c ^= (b >> 15);					\
 } while (/*CONSTCOND*/0)
 
 /*
  * Hash the six bytes at 'addr' together with the per-bridge key
  * sc->sc_rthash_key and return the result masked with
  * BRIDGE_RTHASH_MASK.
  */
 static __inline uint32_t
 bridge_rthash(struct bridge_softc *sc, const uint8_t *addr)
 {
 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key;
 
 	/*
 	 * Widen each byte to uint32_t before shifting.  A uint8_t operand
 	 * is promoted to (signed) int, and shifting a value >= 0x80 left by
 	 * 24 would overflow it, which is undefined behaviour.
 	 */
 	b += (uint32_t)addr[5] << 8;
 	b += (uint32_t)addr[4];
 	a += (uint32_t)addr[3] << 24;
 	a += (uint32_t)addr[2] << 16;
 	a += (uint32_t)addr[1] << 8;
 	a += (uint32_t)addr[0];
 
 	mix(a, b, c);
 
 	return (c & BRIDGE_RTHASH_MASK);
 }
 
 #undef mix
 
 /*
  * Compare two Ethernet addresses byte by byte.  Returns zero when they
  * are equal, otherwise the difference a[i] - b[i] of the first pair of
  * bytes that differ.
  */
 static int
 bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b)
 {
 	int idx, diff;
 
 	for (idx = 0; idx < ETHER_ADDR_LEN; idx++) {
 		diff = (int)a[idx] - (int)b[idx];
 		if (diff != 0)
 			return (diff);
 	}
 
 	return (0);
 }
 
 /*
  * bridge_rtnode_lookup:
  *
  *	Look up a bridge route node for the specified destination. Compare the
  *	vlan id or if zero then just return the first match.
  */
 static struct bridge_rtnode *
 bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *node;
 	uint32_t bucket;
 	int cmp;
 
 	BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(sc);
 
 	bucket = bridge_rthash(sc, addr);
 	CK_LIST_FOREACH(node, &sc->sc_rthash[bucket], brt_hash) {
 		cmp = bridge_rtnode_addr_cmp(addr, node->brt_addr);
 
 		/* The search gives up at the first entry that compares lower. */
 		if (cmp > 0)
 			break;
 
 		/* A vlan of zero accepts the first entry with this address. */
 		if (cmp == 0 && (vlan == 0 || node->brt_vlan == vlan))
 			return (node);
 	}
 
 	return (NULL);
 }
 
 /*
  * bridge_rtnode_insert:
  *
  *	Insert the specified bridge node into the route table.  We
  *	assume the entry is not already in the table.
  */
 static int
 bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
 {
 	struct bridge_rtnode *cur, *following;
 	uint32_t bucket;
 	int cmp;
 
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	bucket = bridge_rthash(sc, brt->brt_addr);
 	cur = CK_LIST_FIRST(&sc->sc_rthash[bucket]);
 
 	if (cur == NULL) {
 		/* Empty bucket: the new node becomes its only member. */
 		CK_LIST_INSERT_HEAD(&sc->sc_rthash[bucket], brt, brt_hash);
 	} else {
 		/*
 		 * Walk the bucket: insert in front of the first entry the
 		 * new address compares greater than, or after the last entry.
 		 */
 		for (;;) {
 			cmp = bridge_rtnode_addr_cmp(brt->brt_addr,
 			    cur->brt_addr);
 			if (cmp == 0 && brt->brt_vlan == cur->brt_vlan)
 				return (EEXIST);
 			if (cmp > 0) {
 				CK_LIST_INSERT_BEFORE(cur, brt, brt_hash);
 				break;
 			}
 			following = CK_LIST_NEXT(cur, brt_hash);
 			if (following == NULL) {
 				CK_LIST_INSERT_AFTER(cur, brt, brt_hash);
 				break;
 			}
 			cur = following;
 		}
 	}
 
 	/* Also link the node on the bridge-wide list and count it. */
 	CK_LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list);
 	sc->sc_brtcnt++;
 
 	return (0);
 }
 
 /*
  * Epoch callback scheduled by bridge_rtnode_destroy(): return the node
  * to its UMA zone, in the vnet the node was created in.
  */
 static void
 bridge_rtnode_destroy_cb(struct epoch_context *ctx)
 {
 	struct bridge_rtnode *node =
 	    __containerof(ctx, struct bridge_rtnode, brt_epoch_ctx);
 
 	CURVNET_SET(node->brt_vnet);
 	uma_zfree(V_bridge_rtnode_zone, node);
 	CURVNET_RESTORE();
 }
 
 /*
  * bridge_rtnode_destroy:
  *
  *	Destroy a bridge rtnode.
  */
 static void
 bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt)
 {
 	BRIDGE_RT_LOCK_ASSERT(sc);
 
 	/* Unlink from the hash bucket and from the bridge-wide list. */
 	CK_LIST_REMOVE(brt, brt_hash);
 	CK_LIST_REMOVE(brt, brt_list);
 
 	/* Drop the per-member and bridge-wide entry counts. */
 	brt->brt_dst->bif_addrcnt--;
 	sc->sc_brtcnt--;
 
 	/* The memory is released later, from the epoch callback. */
 	NET_EPOCH_CALL(bridge_rtnode_destroy_cb, &brt->brt_epoch_ctx);
 }
 
 /*
  * bridge_rtable_expire:
  *
  *	Set the expiry time for all routes on an interface.
  */
 static void
 bridge_rtable_expire(struct ifnet *ifp, int age)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_rtnode *node;
 
 	CURVNET_SET(ifp->if_vnet);
 	BRIDGE_RT_LOCK(sc);
 
 	if (age != 0) {
 		/*
 		 * Cap the expiry time of this interface's dynamic entries
 		 * to 'age' seconds from now.
 		 */
 		CK_LIST_FOREACH(node, &sc->sc_rtlist, brt_list) {
 			if (node->brt_ifp != ifp)
 				continue;
 			if ((node->brt_flags & IFBAF_TYPEMASK) !=
 			    IFBAF_DYNAMIC)
 				continue;
 			if (node->brt_expire > time_uptime + age)
 				node->brt_expire = time_uptime + age;
 		}
 	} else {
 		/* An age of zero means flush the interface's entries now. */
 		bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN);
 	}
 
 	BRIDGE_RT_UNLOCK(sc);
 	CURVNET_RESTORE();
 }
 
 /*
  * bridge_state_change:
  *
  *	Callback from the bridgestp code when a port changes states.
  */
 /*
  * Log (when V_log_stp is set) that member 'ifp' has moved to spanning
  * tree state 'state'.  A value with no entry in the name table is
  * logged as "unknown" instead of indexing past the end of the table.
  */
 static void
 bridge_state_change(struct ifnet *ifp, int state)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	static const char * const stpstates[] = {
 		"disabled",
 		"listening",
 		"learning",
 		"forwarding",
 		"blocking",
 		"discarding"
 	};
 	const char *statename;
 
 	CURVNET_SET(ifp->if_vnet);
 	if (V_log_stp) {
 		/* Guard the table lookup; 'state' comes from another module. */
 		if (state >= 0 &&
 		    (size_t)state < sizeof(stpstates) / sizeof(stpstates[0]))
 			statename = stpstates[state];
 		else
 			statename = "unknown";
 		log(LOG_NOTICE, "%s: state changed to %s on %s\n",
 		    sc->sc_ifp->if_xname, statename, ifp->if_xname);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Send bridge packets through pfil if they are one of the types pfil can deal
  * with, or if they are ARP or REVARP.  (pfil will pass ARP and REVARP without
  * question.)  If bifp or ifp is NULL then packet filtering is skipped for
  * that interface.
  */
 static int
 bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
 {
 	int snap, error, i, hlen;
 	struct ether_header *eh1, eh2;
 	struct ip *ip;
 	struct llc llc1;
 	u_int16_t ether_type;
 	pfil_return_t rv;
 
 	/*
 	 * mp:   in/out packet; may be replaced, or set to NULL when the
 	 *       packet is freed here.
 	 * bifp: the bridge interface, or NULL to skip filtering on it.
 	 * ifp:  the member interface, or NULL to skip filtering on it.
 	 * dir:  PFIL_IN or PFIL_OUT.
 	 *
 	 * Returns 0 when the caller may continue (it must still check *mp
 	 * for NULL) and non-zero when it must not use the packet any more.
 	 */
 	snap = 0;
 	error = -1;	/* Default error if not error == 0 */
 
 #if 0
 	/* we may return with the IP fields swapped, ensure its not shared */
 	KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__));
 #endif
 
 	if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0)
 		return (0); /* filtering is disabled */
 
 	/* Make the leading headers contiguous in the first mbuf. */
 	i = min((*mp)->m_pkthdr.len, max_protohdr);
 	if ((*mp)->m_len < i) {
 	    *mp = m_pullup(*mp, i);
 	    if (*mp == NULL) {
 		printf("%s: m_pullup failed\n", __func__);
 		return (-1);
 	    }
 	}
 
 	eh1 = mtod(*mp, struct ether_header *);
 	ether_type = ntohs(eh1->ether_type);
 
 	/*
 	 * Check for SNAP/LLC.
 	 */
 	if (ether_type < ETHERMTU) {
 		struct llc *llc2 = (struct llc *)(eh1 + 1);
 
 		if ((*mp)->m_len >= ETHER_HDR_LEN + 8 &&
 		    llc2->llc_dsap == LLC_SNAP_LSAP &&
 		    llc2->llc_ssap == LLC_SNAP_LSAP &&
 		    llc2->llc_control == LLC_UI) {
 			/* The real ethertype lives in the SNAP header. */
 			ether_type = htons(llc2->llc_un.type_snap.ether_type);
 			snap = 1;
 		}
 	}
 
 	/*
 	 * If we're trying to filter bridge traffic, don't look at anything
 	 * other than IP and ARP traffic.  If the filter doesn't understand
 	 * IPv6, don't allow IPv6 through the bridge either.  This is lame
 	 * since if we really wanted, say, an AppleTalk filter, we are hosed,
 	 * but of course we don't have an AppleTalk filter to begin with.
 	 * (Note that since pfil doesn't understand ARP it will pass *ALL*
 	 * ARP traffic.)
 	 */
 	switch (ether_type) {
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			if (V_pfil_ipfw_arp == 0)
 				return (0); /* Automatically pass */
 			break;
 
 		case ETHERTYPE_IP:
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 #endif /* INET6 */
 			break;
 		default:
 			/*
 			 * Check to see if the user wants to pass non-ip
 			 * packets, these will not be checked by pfil(9) and
 			 * passed unconditionally so the default is to drop.
 			 */
 			if (V_pfil_onlyip)
 				goto bad;
 	}
 
 	/* Run the packet through pfil before stripping link headers */
 	if (PFIL_HOOKED_OUT(V_link_pfil_head) && V_pfil_ipfw != 0 &&
 	    dir == PFIL_OUT && ifp != NULL) {
 		switch (pfil_run_hooks(V_link_pfil_head, mp, ifp, dir, NULL)) {
 		case PFIL_DROPPED:
 			return (EACCES);
 		case PFIL_CONSUMED:
 			return (0);
 		}
 	}
 
 	/* Strip off the Ethernet header and keep a copy. */
 	m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2);
 	m_adj(*mp, ETHER_HDR_LEN);
 
 	/* Strip off snap header, if present */
 	if (snap) {
 		m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1);
 		m_adj(*mp, sizeof(struct llc));
 	}
 
 	/*
 	 * Check the IP header for alignment and errors
 	 */
 	if (dir == PFIL_IN) {
 		switch (ether_type) {
 			case ETHERTYPE_IP:
 				error = bridge_ip_checkbasic(mp);
 				break;
 #ifdef INET6
 			case ETHERTYPE_IPV6:
 				error = bridge_ip6_checkbasic(mp);
 				break;
 #endif /* INET6 */
 			default:
 				error = 0;
 		}
 		if (error)
 			goto bad;
 	}
 
 	error = 0;
 
 	/*
 	 * Run the packet through pfil
 	 */
 	rv = PFIL_PASS;
 	switch (ether_type) {
 	case ETHERTYPE_IP:
 		/*
 		 * Run pfil on the member interface and the bridge, both can
 		 * be skipped by clearing pfil_member or pfil_bridge.
 		 *
 		 * Keep the order:
 		 *   in_if -> bridge_if -> out_if
 		 */
 		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv =
 		    pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 
 		if (V_pfil_member && ifp != NULL && (rv =
 		    pfil_run_hooks(V_inet_pfil_head, mp, ifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 
 		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv =
 		    pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 
 		/* check if we need to fragment the packet */
 		/* bridge_fragment generates a mbuf chain of packets */
 		/* that already include eth headers */
 		if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) {
 			i = (*mp)->m_pkthdr.len;
 			if (i > ifp->if_mtu) {
 				/* Returns directly: headers are already re-added. */
 				error = bridge_fragment(ifp, mp, &eh2, snap,
 					    &llc1);
 				return (error);
 			}
 		}
 
 		/* Recalculate the ip checksum. */
 		ip = mtod(*mp, struct ip *);
 		hlen = ip->ip_hl << 2;
 		if (hlen < sizeof(struct ip))
 			goto bad;
 		if (hlen > (*mp)->m_len) {
 			if ((*mp = m_pullup(*mp, hlen)) == NULL)
 				goto bad;
 			ip = mtod(*mp, struct ip *);
 			if (ip == NULL)
 				goto bad;
 		}
 		ip->ip_sum = 0;
 		if (hlen == sizeof(struct ip))
 			ip->ip_sum = in_cksum_hdr(ip);
 		else
 			ip->ip_sum = in_cksum(*mp, hlen);
 
 		break;
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		/* Same in_if -> bridge_if -> out_if ordering as for IPv4. */
 		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv =
 		    pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 
 		if (V_pfil_member && ifp != NULL && (rv =
 		    pfil_run_hooks(V_inet6_pfil_head, mp, ifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 
 		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv =
 		    pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) !=
 		    PFIL_PASS)
 			break;
 		break;
 #endif
 	}
 
 	/* Translate the pfil verdict; only PFIL_PASS continues below. */
 	switch (rv) {
 	case PFIL_CONSUMED:
 		return (0);
 	case PFIL_DROPPED:
 		return (EACCES);
 	default:
 		break;
 	}
 
 	/* From here on, a failure can only be a failed M_PREPEND. */
 	error = -1;
 
 	/*
 	 * Finally, put everything back the way it was and return
 	 */
 	if (snap) {
 		M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT);
 		if (*mp == NULL)
 			return (error);
 		bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc));
 	}
 
 	M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT);
 	if (*mp == NULL)
 		return (error);
 	bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
 
 	return (0);
 
 bad:
 	/* Free whatever is left and tell the caller the packet is gone. */
 	m_freem(*mp);
 	*mp = NULL;
 	return (error);
 }
 
 /*
  * Perform basic checks on header size since
  * pfil assumes ip_input has already processed
  * it for it.  Cut-and-pasted from ip_input.c.
  * Given how simple the IPv6 version is,
  * does the IPv4 version really need to be
  * this complicated?
  *
  * XXX Should we update ipstat here, or not?
  * XXX Right now we update ipstat but not
  * XXX csum_counter.
  */
 static int
 bridge_ip_checkbasic(struct mbuf **mp)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip;
 	int len, hlen;
 	int rc = -1;		/* pessimistic until every check passed */
 	u_short sum;
 
 	if (m == NULL)
 		return (-1);
 
 	/*
 	 * Get an aligned IP header that is contiguous in the first mbuf.
 	 */
 	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
 		m = m_copyup(m, sizeof(struct ip), (max_linkhdr + 3) & ~3);
 		if (m == NULL) {
 			/* XXXJRT new stat, please */
 			KMOD_IPSTAT_INC(ips_toosmall);
 			goto out;
 		}
 	} else if (__predict_false(m->m_len < sizeof (struct ip))) {
 		m = m_pullup(m, sizeof (struct ip));
 		if (m == NULL) {
 			KMOD_IPSTAT_INC(ips_toosmall);
 			goto out;
 		}
 	}
 	ip = mtod(m, struct ip *);
 	if (ip == NULL)
 		goto out;
 
 	/* Version and header length sanity. */
 	if (ip->ip_v != IPVERSION) {
 		KMOD_IPSTAT_INC(ips_badvers);
 		goto out;
 	}
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) { /* minimum header length */
 		KMOD_IPSTAT_INC(ips_badhlen);
 		goto out;
 	}
 	if (hlen > m->m_len) {
 		/* Options present: pull the complete header up as well. */
 		m = m_pullup(m, hlen);
 		if (m == NULL) {
 			KMOD_IPSTAT_INC(ips_badhlen);
 			goto out;
 		}
 		ip = mtod(m, struct ip *);
 		if (ip == NULL)
 			goto out;
 	}
 
 	/* Use the checksum flags when already checked, else compute it. */
 	if ((m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) != 0)
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	else if (hlen == sizeof(struct ip))
 		sum = in_cksum_hdr(ip);
 	else
 		sum = in_cksum(m, hlen);
 	if (sum) {
 		KMOD_IPSTAT_INC(ips_badsum);
 		goto out;
 	}
 
 	/* The total length must at least cover the header itself... */
 	len = ntohs(ip->ip_len);
 	if (len < hlen) {
 		KMOD_IPSTAT_INC(ips_badlen);
 		goto out;
 	}
 
 	/* ...and the mbuf chain must hold at least that many bytes. */
 	if (m->m_pkthdr.len < len) {
 		KMOD_IPSTAT_INC(ips_tooshort);
 		goto out;
 	}
 
 	/* Checks out, proceed */
 	rc = 0;
 
 out:
 	/* Hand the (possibly replaced or NULL) chain back either way. */
 	*mp = m;
 	return (rc);
 }
 
 #ifdef INET6
 /*
  * Same as above, but for IPv6.
  * Cut-and-pasted from ip6_input.c.
  * XXX Should we update ip6stat, or not?
  */
 static int
 bridge_ip6_checkbasic(struct mbuf **mp)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6;
 	struct ifnet *inifp;
 	int rc = -1;		/* pessimistic until every check passed */
 
 	/*
 	 * An unaligned IPv6 header is copied into a fresh mbuf that leaves
 	 * room for link headers; an aligned one only has to be contiguous
 	 * in the first mbuf of the chain.
 	 */
 	if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
 		/* Remember the interface: m is gone if the copy fails. */
 		inifp = m->m_pkthdr.rcvif;
 		m = m_copyup(m, sizeof(struct ip6_hdr),
 		    (max_linkhdr + 3) & ~3);
 		if (m == NULL) {
 			/* XXXJRT new stat, please */
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			goto out;
 		}
 	} else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) {
 		inifp = m->m_pkthdr.rcvif;
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			goto out;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* Only the version field is verified here. */
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		goto out;
 	}
 
 	/* Checks out, proceed */
 	rc = 0;
 
 out:
 	/* Hand the (possibly replaced or NULL) chain back either way. */
 	*mp = m;
 	return (rc);
 }
 #endif /* INET6 */
 
 /*
  * bridge_fragment:
  *
  *	Fragment mbuf chain in multiple packets and prepend ethernet header.
  */
 /*
  * Fragment the IPv4 packet in *mp to ifp->if_mtu and put the saved
  * Ethernet header 'eh' (and, when 'snap' is set, the LLC/SNAP header
  * 'llc') back in front of every fragment.
  *
  * On success returns 0 and *mp is the first packet of a chain linked
  * through m_nextpkt.  On failure returns non-zero; every mbuf has been
  * freed and *mp is NULL.
  */
 static int
 bridge_fragment(struct ifnet *ifp, struct mbuf **mp, struct ether_header *eh,
     int snap, struct llc *llc)
 {
 	struct mbuf *m = *mp, *nextpkt = NULL, *mprev = NULL, *mcur = NULL;
 	struct ip *ip;
 	int error = -1;
 
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == NULL) {
 		/*
 		 * m_pullup() has already freed the chain, so *mp is stale;
 		 * it must not be walked or freed again.
 		 */
 		*mp = NULL;
 		return (error);
 	}
 	ip = mtod(m, struct ip *);
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist);
 	/*
 	 * Track the live head from here on: m_pullup() may have replaced
 	 * the first mbuf, so the caller's original pointer can be stale.
 	 */
 	*mp = m;
 	if (error)
 		goto dropit;
 
 	/*
 	 * Walk the chain and re-add the Ethernet header for
 	 * each mbuf packet.
 	 */
 	for (mcur = m; mcur; mcur = mcur->m_nextpkt) {
 		nextpkt = mcur->m_nextpkt;
 		mcur->m_nextpkt = NULL;
 		if (snap) {
 			M_PREPEND(mcur, sizeof(struct llc), M_NOWAIT);
 			if (mcur == NULL) {
 				error = ENOBUFS;
 				goto unlink;
 			}
 			bcopy(llc, mtod(mcur, caddr_t),sizeof(struct llc));
 		}
 
 		M_PREPEND(mcur, ETHER_HDR_LEN, M_NOWAIT);
 		if (mcur == NULL) {
 			error = ENOBUFS;
 			goto unlink;
 		}
 		bcopy(eh, mtod(mcur, caddr_t), ETHER_HDR_LEN);
 
 		/*
 		 * The previous two M_PREPEND could have inserted one or two
 		 * mbufs in front so we have to update the previous packet's
 		 * m_nextpkt.
 		 */
 		mcur->m_nextpkt = nextpkt;
 		if (mprev != NULL)
 			mprev->m_nextpkt = mcur;
 		else {
 			/* The first mbuf in the original chain needs to be
 			 * updated. */
 			*mp = mcur;
 		}
 		mprev = mcur;
 	}
 
 	KMOD_IPSTAT_INC(ips_fragmented);
 	return (error);
 
 unlink:
 	/*
 	 * M_PREPEND freed the current packet.  Splice it out so that the
 	 * chain starting at *mp holds exactly the packets still allocated:
 	 * the ones already processed plus the ones not yet reached.
 	 */
 	if (mprev != NULL)
 		mprev->m_nextpkt = nextpkt;
 	else
 		*mp = nextpkt;
 
 dropit:
 	for (mcur = *mp; mcur; mcur = m) { /* dropping the full packet chain */
 		m = mcur->m_nextpkt;
 		m_freem(mcur);
 	}
 	/* Do not leave the caller with a pointer to freed memory. */
 	*mp = NULL;
 	return (error);
 }
 
 /*
  * Link state of member 'ifp' changed: re-evaluate the link state of the
  * bridge and pass the event on to the member's spanning tree port.
  */
 static void
 bridge_linkstate(struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *member;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 
 	/* Nothing to do when ifp is not a member of the bridge. */
 	member = bridge_lookup_member_if(sc, ifp);
 	if (member != NULL) {
 		bridge_linkcheck(sc);
 		bstp_linkstate(&member->bif_stp);
 	}
 
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Derive the link state of the bridge interface from its members and
  * report it with if_link_state_change().
  */
 static void
 bridge_linkcheck(struct bridge_softc *sc)
 {
 	struct bridge_iflist *member;
 	int link_state = LINK_STATE_DOWN;
 	int ls_capable = 0;
 
 	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);
 
 	/* Our link is considered up if at least one of our ports is active */
 	CK_LIST_FOREACH(member, &sc->sc_iflist, bif_next) {
 		if ((member->bif_ifp->if_capabilities & IFCAP_LINKSTATE) != 0)
 			ls_capable++;
 		if (member->bif_ifp->if_link_state == LINK_STATE_UP) {
 			link_state = LINK_STATE_UP;
 			break;
 		}
 	}
 
 	/* If no interfaces support link-state then we default to up */
 	if (!CK_LIST_EMPTY(&sc->sc_iflist) && ls_capable == 0)
 		link_state = LINK_STATE_UP;
 
 	if_link_state_change(sc->sc_ifp, link_state);
 }
diff --git a/sys/net/if_clone.c b/sys/net/if_clone.c
index 8360fb4d1d27..7dcb3c271e42 100644
--- a/sys/net/if_clone.c
+++ b/sys/net/if_clone.c
@@ -1,854 +1,855 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 /*
  * Current IF_MAXUNIT expands maximum to 5 characters, so the cloner name
  * buffer is sized to leave that much room within IFNAMSIZ.
  */
 #define	IFCLOSIZ	(IFNAMSIZ - 5)
 
 /*
  * Structure describing a `cloning' interface.
  *
  * List of locks
  * (c)		const until freeing
  * (d)		driver specific data, may need external protection.
  * (e)		locked by if_cloners_mtx
  * (i)		locked by ifc_mtx mtx
  */
 struct if_clone {
 	char ifc_name[IFCLOSIZ];	/* (c) Name of device, e.g. `gif' */
 	struct unrhdr *ifc_unrhdr;	/* (c) alloc_unr(9) header */
 	int ifc_maxunit;		/* (c) maximum unit number */
 	int ifc_flags;			/* IFC_F_* flags; no lock annotated */
 	long ifc_refcnt;		/* (i) Reference count. */
 	LIST_HEAD(, ifnet) ifc_iflist;	/* (i) List of cloned interfaces */
 	struct mtx ifc_mtx;		/* Mutex to protect members. */
 
 	ifc_match_f *ifc_match;		/* (c) Matcher function */
 	ifc_create_f *ifc_create;	/* (c) Creates new interface */
 	ifc_destroy_f *ifc_destroy;	/* (c) Destroys cloned interface */
 
 #ifdef CLONE_COMPAT_13
 	/* (c) Driver specific cloning functions.  Called with no locks held. */
 	union {
 		struct {	/* advanced cloner */
 			ifc_create_t	*_ifc_create;
 			ifc_destroy_t	*_ifc_destroy;
 		} A;
 		struct {	/* simple cloner */
 			ifcs_create_t	*_ifcs_create;
 			ifcs_destroy_t	*_ifcs_destroy;
 			int		_ifcs_minifs;	/* minimum ifs */
 
 		} S;
 	} U;
 /* Shorthand accessors for the compat union members above. */
 #define	ifca_create	U.A._ifc_create
 #define	ifca_destroy	U.A._ifc_destroy
 #define	ifcs_create	U.S._ifcs_create
 #define	ifcs_destroy	U.S._ifcs_destroy
 #define	ifcs_minifs	U.S._ifcs_minifs
 #endif
 
 	LIST_ENTRY(if_clone) ifc_list;	/* (e) On list of cloners */
 };
 
 
 
 /* Forward declarations for file-local helpers. */
 static void	if_clone_free(struct if_clone *ifc);
 static int	if_clone_createif(struct if_clone *ifc, char *name, size_t len,
 		    struct ifc_data *ifd, struct ifnet **ifpp);
 
 static int ifc_simple_match(struct if_clone *ifc, const char *name);
 static int ifc_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit);
 
 #ifdef CLONE_COMPAT_13
 static int ifc_simple_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen,
     struct ifc_data *ifc_data, struct ifnet **ifpp);
 static int ifc_advanced_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen,
     struct ifc_data *ifc_data, struct ifnet **ifpp);
 #endif
 
 /*
  * A single global mutex guards the cloner list and count; the list and
  * count themselves are per-vnet.
  */
 static struct mtx if_cloners_mtx;
 MTX_SYSINIT(if_cloners_lock, &if_cloners_mtx, "if_cloners lock", MTX_DEF);
 VNET_DEFINE_STATIC(int, if_cloners_count);
 VNET_DEFINE(LIST_HEAD(, if_clone), if_cloners);
 
 #define	V_if_cloners_count	VNET(if_cloners_count)
 #define	V_if_cloners		VNET(if_cloners)
 
 /* Global cloner-list lock. */
 #define IF_CLONERS_LOCK_ASSERT()	mtx_assert(&if_cloners_mtx, MA_OWNED)
 #define IF_CLONERS_LOCK()		mtx_lock(&if_cloners_mtx)
 #define IF_CLONERS_UNLOCK()		mtx_unlock(&if_cloners_mtx)
 
 /* Per-cloner lock (ifc_mtx). */
 #define IF_CLONE_LOCK_INIT(ifc)		\
     mtx_init(&(ifc)->ifc_mtx, "if_clone lock", NULL, MTX_DEF)
 #define IF_CLONE_LOCK_DESTROY(ifc)	mtx_destroy(&(ifc)->ifc_mtx)
 #define IF_CLONE_LOCK_ASSERT(ifc)	mtx_assert(&(ifc)->ifc_mtx, MA_OWNED)
 #define IF_CLONE_LOCK(ifc)		mtx_lock(&(ifc)->ifc_mtx)
 #define IF_CLONE_UNLOCK(ifc)		mtx_unlock(&(ifc)->ifc_mtx)
 
 /*
  * Cloner reference counting.
  *
  * The plain variants take the cloner lock themselves; the _LOCKED variants
  * expect it to be held on entry.  IF_CLONE_REMREF_LOCKED() always returns
  * with the lock released, and calls if_clone_free() when the count drops
  * to zero, so the cloner must not be touched after a REMREF.
  */
 #define IF_CLONE_ADDREF(ifc)						\
 	do {								\
 		IF_CLONE_LOCK(ifc);					\
 		IF_CLONE_ADDREF_LOCKED(ifc);				\
 		IF_CLONE_UNLOCK(ifc);					\
 	} while (0)
 #define IF_CLONE_ADDREF_LOCKED(ifc)					\
 	do {								\
 		IF_CLONE_LOCK_ASSERT(ifc);				\
 		KASSERT((ifc)->ifc_refcnt >= 0,				\
 		    ("negative refcnt %ld", (ifc)->ifc_refcnt));	\
 		(ifc)->ifc_refcnt++;					\
 	} while (0)
 #define IF_CLONE_REMREF(ifc)						\
 	do {								\
 		IF_CLONE_LOCK(ifc);					\
 		IF_CLONE_REMREF_LOCKED(ifc);				\
 	} while (0)
 #define IF_CLONE_REMREF_LOCKED(ifc)					\
 	do {								\
 		IF_CLONE_LOCK_ASSERT(ifc);				\
 		KASSERT((ifc)->ifc_refcnt > 0,				\
 		    ("bogus refcnt %ld", (ifc)->ifc_refcnt));		\
 		if (--(ifc)->ifc_refcnt == 0) {				\
 			IF_CLONE_UNLOCK(ifc);				\
 			if_clone_free(ifc);				\
 		} else {						\
 			/* silently free the lock */			\
 			IF_CLONE_UNLOCK(ifc);				\
 		}							\
 	} while (0)
 
 /*
  * Add/remove an ifnet on a cloner's interface list (linked through the
  * ifnet's if_clones entry).  The callers in this file hold the cloner lock
  * around both operations.  Arguments are parenthesized so that any
  * expression may be passed safely.
  */
 #define IFC_IFLIST_INSERT(_ifc, _ifp)					\
 	LIST_INSERT_HEAD(&(_ifc)->ifc_iflist, (_ifp), if_clones)
 #define IFC_IFLIST_REMOVE(_ifc, _ifp)					\
 	LIST_REMOVE((_ifp), if_clones)
 
 /* Malloc type used for cloner structures and temporary buffers here. */
 static MALLOC_DEFINE(M_CLONE, "clone", "interface cloning framework");
 
 /*
  * Initialize the current vnet's (initially empty) list of cloners.
  */
 void
 vnet_if_clone_init(void)
 {
 
 	LIST_INIT(&V_if_cloners);
 }
 
 /*
  * Lookup and create a clone network interface.
  */
 int
 ifc_create_ifp(const char *name, struct ifc_data *ifd,
     struct ifnet **ifpp)
 {
 	struct if_clone *ifc;
 	char ifname[IFNAMSIZ];
 	struct ifnet *ifp = NULL;
 	int error;
 
 	/* Try to find an applicable cloner for this request */
 	IF_CLONERS_LOCK();
 	LIST_FOREACH(ifc, &V_if_cloners, ifc_list) {
 		if (ifc->ifc_match(ifc, name))
 			break;
 	}
 	IF_CLONERS_UNLOCK();
 
 	if (ifc == NULL)
 		return (EINVAL);
 
 	strlcpy(ifname, name, IFNAMSIZ);
 	error = if_clone_createif(ifc, ifname, IFNAMSIZ, ifd, &ifp);
 	if (ifpp != NULL)
 		*ifpp = ifp;
 
 	return (error);
 }
 
 /*
  * Create a clone interface from `name' with driver parameters `params'.
  * On success the created interface's name is copied back into `name'
  * (a buffer of `len' bytes).
  */
 int
 if_clone_create(char *name, size_t len, caddr_t params)
 {
 	struct ifc_data ifd = { .params = params };
 	struct ifnet *ifp;
 	int error;
 
 	error = ifc_create_ifp(name, &ifd, &ifp);
 	if (error != 0)
 		return (error);
 
 	strlcpy(name, if_name(ifp), len);
 	return (0);
 }
 
 /*
  * Record `ifp' as an interface created by cloner `ifc'.  Unless the cloner
  * has the no-group flag set, ifp is first added to the interface group
  * named after the cloner; it is then put on the cloner's interface list
  * under the cloner lock.
  */
 void
 ifc_link_ifp(struct if_clone *ifc, struct ifnet *ifp)
 {
 
 	if ((ifc->ifc_flags & IFC_NOGROUP) == 0)
 		if_addgroup(ifp, ifc->ifc_name);
 
 	IF_CLONE_LOCK(ifc);
 	IFC_IFLIST_INSERT(ifc, ifp);
 	IF_CLONE_UNLOCK(ifc);
 }
 
 /*
  * Alternate entry point for ifc_link_ifp(); behaves identically.
  */
 void
 if_clone_addif(struct if_clone *ifc, struct ifnet *ifp)
 {
 	ifc_link_ifp(ifc, ifp);
 }
 
 /*
  * Take `ifp' off cloner `ifc''s interface list.  Returns false when ifp
  * was not on the list.  When it was, ifp is also removed from the
  * cloner-named interface group (unless the cloner opted out of groups)
  * and true is returned.
  */
 bool
 ifc_unlink_ifp(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifnet *cur;
 	bool found = false;
 
 	IF_CLONE_LOCK(ifc);
 	LIST_FOREACH(cur, &ifc->ifc_iflist, if_clones) {
 		if (cur != ifp)
 			continue;
 		IFC_IFLIST_REMOVE(ifc, ifp);
 		found = true;
 		break;
 	}
 	IF_CLONE_UNLOCK(ifc);
 
 	if (!found)
 		return (false);
 
 	/* Group membership is updated outside the cloner lock. */
 	if ((ifc->ifc_flags & IFC_F_NOGROUP) == 0)
 		if_delgroup(ifp, ifc->ifc_name);
 	return (true);
 }
 
 static struct if_clone *
 ifc_find_cloner(const char *name, struct vnet *vnet)
 {
 	struct if_clone *ifc;
 
 	CURVNET_SET_QUIET(vnet);
 	IF_CLONERS_LOCK();
 	LIST_FOREACH(ifc, &V_if_cloners, ifc_list) {
 		if (strcmp(ifc->ifc_name, name) == 0) {
 			break;
 		}
 	}
 	IF_CLONERS_UNLOCK();
 	CURVNET_RESTORE();
 
 	return (ifc);
 }
 
 /*
  * Create a clone network interface.
  *
  * Fails with EEXIST when an interface called `name' already exists.  For
  * IFC_F_AUTOUNIT cloners the unit is reserved first (which may rewrite
  * `name') and stored in ifd->unit; that reservation is released again if
  * the cloner's create routine fails.  On success *ifpp is linked to the
  * cloner.
  */
 static int
 if_clone_createif(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	int error;
 	int unit = 0;
 
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 
 	if ((ifc->ifc_flags & IFC_F_AUTOUNIT) != 0) {
 		error = ifc_handle_unit(ifc, name, len, &unit);
 		if (error != 0)
 			return (error);
 		ifd->unit = unit;
 	}
 
 	*ifpp = NULL;
 	error = (*ifc->ifc_create)(ifc, name, len, ifd, ifpp);
 	if (error != 0) {
 		/* Give back the unit reserved above. */
 		if ((ifc->ifc_flags & IFC_F_AUTOUNIT) != 0)
 			ifc_free_unit(ifc, unit);
 		return (error);
 	}
 
 	MPASS(*ifpp != NULL);
 	if_clone_addif(ifc, *ifpp);
 	return (0);
 }
 
 /*
  * Lookup and destroy a clone network interface.
  */
 int
 if_clone_destroy(const char *name)
 {
 	int err;
 	struct if_clone *ifc;
 	struct ifnet *ifp;
 
 	ifp = ifunit_ref(name);
 	if (ifp == NULL)
 		return (ENXIO);
 
 	ifc = ifc_find_cloner(ifp->if_dname, ifp->if_home_vnet);
 	if (ifc == NULL) {
 		if_rele(ifp);
 		return (EINVAL);
 	}
 
 	err = if_clone_destroyif(ifc, ifp);
 	if_rele(ifp);
 	return err;
 }
 
 /*
  * Destroy a clone network interface.
  *
  * Unlinks ifp from the cloner's list and calls the cloner's destroy
  * routine with `flags'.  Returns ENXIO when ifp is not on the cloner's
  * list, otherwise the destroy routine's result.
  */
 static int
 if_clone_destroyif_flags(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	int err;
 
 	/*
 	 * Given that the cloned ifnet might be attached to a different
 	 * vnet from where its cloner was registered, we have to
 	 * switch to the vnet context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (!ifc_unlink_ifp(ifc, ifp)) {
 		CURVNET_RESTORE();
 		return (ENXIO);		/* ifp is not on the list. */
 	}
 
 	/*
 	 * Read the unit before calling destroy; presumably ifp must not be
 	 * dereferenced once destroy has succeeded.
 	 */
 	int unit = ifp->if_dunit;
 	err = (*ifc->ifc_destroy)(ifc, ifp, flags);
 
 	/* On failure re-link ifp; on success release an auto-assigned unit. */
 	if (err != 0)
 		ifc_link_ifp(ifc, ifp);
 	else if (ifc->ifc_flags & IFC_F_AUTOUNIT)
 		ifc_free_unit(ifc, unit);
 	CURVNET_RESTORE();
 	return (err);
 }
 
 /*
  * Destroy `ifp' through cloner `ifc' with no destroy flags set.
  */
 int
 if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp)
 {
 	return (if_clone_destroyif_flags(ifc, ifp, 0));
 }
 
 /*
  * Allocate and initialize a cloner called `name'.
  *
  * The cloner is returned holding one reference.  A `maxunit' of 0 selects
  * IF_MAXUNIT.  Names longer than the ifc_name buffer are truncated.
  */
 static struct if_clone *
 if_clone_alloc(const char *name, int maxunit)
 {
 	struct if_clone *ifc;
 
 	KASSERT(name != NULL, ("%s: no name\n", __func__));
 
 	ifc = malloc(sizeof(*ifc), M_CLONE, M_WAITOK | M_ZERO);
 	/* strlcpy() always terminates, independent of M_ZERO above. */
 	strlcpy(ifc->ifc_name, name, sizeof(ifc->ifc_name));
 	IF_CLONE_LOCK_INIT(ifc);
 	IF_CLONE_ADDREF(ifc);
 	ifc->ifc_maxunit = maxunit ? maxunit : IF_MAXUNIT;
 	ifc->ifc_unrhdr = new_unrhdr(0, ifc->ifc_maxunit, &ifc->ifc_mtx);
 	LIST_INIT(&ifc->ifc_iflist);
 
 	return (ifc);
 }
 
 static int
 if_clone_attach(struct if_clone *ifc)
 {
 	struct if_clone *ifc1;
 
 	IF_CLONERS_LOCK();
 	LIST_FOREACH(ifc1, &V_if_cloners, ifc_list)
 		if (strcmp(ifc->ifc_name, ifc1->ifc_name) == 0) {
 			IF_CLONERS_UNLOCK();
 			IF_CLONE_REMREF(ifc);
 			return (EEXIST);
 		}
 	LIST_INSERT_HEAD(&V_if_cloners, ifc, ifc_list);
 	V_if_cloners_count++;
 	IF_CLONERS_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Register a cloner described by `req' under `name'.
  *
  * Returns NULL when a mandatory callback is missing, when the name is too
  * long, or when a cloner of that name already exists.  A missing match
  * callback defaults to ifc_simple_match().  Only the AUTOUNIT and NOGROUP
  * flags are taken from the request.
  */
 struct if_clone *
 ifc_attach_cloner(const char *name, struct if_clone_addreq *req)
 {
 	struct if_clone *ifc;
 
 	if (req->create_f == NULL || req->destroy_f == NULL ||
 	    strnlen(name, IFCLOSIZ) >= (IFCLOSIZ - 1))
 		return (NULL);
 
 	ifc = if_clone_alloc(name, req->maxunit);
 	if (req->match_f != NULL)
 		ifc->ifc_match = req->match_f;
 	else
 		ifc->ifc_match = ifc_simple_match;
 	ifc->ifc_create = req->create_f;
 	ifc->ifc_destroy = req->destroy_f;
 	ifc->ifc_flags = req->flags & (IFC_F_AUTOUNIT | IFC_F_NOGROUP);
 
 	/* if_clone_attach() releases the cloner itself on failure. */
 	if (if_clone_attach(ifc) != 0)
 		return (NULL);
 
 	EVENTHANDLER_INVOKE(if_clone_event, ifc);
 	return (ifc);
 }
 
 /*
  * Unregister a cloner; forwards to if_clone_detach().
  */
 void
 ifc_detach_cloner(struct if_clone *ifc)
 {
 	if_clone_detach(ifc);
 }
 
 
 #ifdef CLONE_COMPAT_13
 
 /*
  * Adapt a legacy "advanced" create callback to the current create
  * interface: call it with the raw params pointer and, on success, look
  * the new interface up by name to fill in *ifpp.
  */
 static int
 ifc_advanced_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen,
     struct ifc_data *ifc_data, struct ifnet **ifpp)
 {
 	int error;
 
 	error = ifc->ifca_create(ifc, name, maxlen, ifc_data->params);
 	if (error != 0)
 		return (error);
 
 	*ifpp = ifunit(name);
 	return (0);
 }
 
 /*
  * Adapt a legacy "advanced" destroy callback.  `flags' is not passed on.
  * Returns ENOTSUP when the cloner has no destroy callback.
  */
 static int
 ifc_advanced_destroy_wrapper(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	if (ifc->ifca_destroy != NULL)
 		return (ifc->ifca_destroy(ifc, ifp));
 	return (ENOTSUP);
 }
 
 /*
  * Register a legacy "advanced" cloner with caller-supplied match, create
  * and destroy callbacks.  Returns NULL when a cloner called `name' is
  * already registered.
  */
 struct if_clone *
 if_clone_advanced(const char *name, u_int maxunit, ifc_match_t match,
 	ifc_create_t create, ifc_destroy_t destroy)
 {
 	struct if_clone *ifc = if_clone_alloc(name, maxunit);
 
 	/* Legacy callbacks are stored aside and reached through wrappers. */
 	ifc->ifca_create = create;
 	ifc->ifca_destroy = destroy;
 	ifc->ifc_create = ifc_advanced_create_wrapper;
 	ifc->ifc_destroy = ifc_advanced_destroy_wrapper;
 	ifc->ifc_match = match;
 
 	if (if_clone_attach(ifc) != 0)
 		return (NULL);
 
 	EVENTHANDLER_INVOKE(if_clone_event, ifc);
 	return (ifc);
 }
 
 /*
  * Adapt a legacy "simple" create callback: parse the unit out of `name',
  * call the callback with it and, on success, look the new interface up by
  * name to fill in *ifpp.  The result of the unit parse is not checked.
  */
 static int
 ifc_simple_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen,
     struct ifc_data *ifc_data, struct ifnet **ifpp)
 {
 	int error;
 	int unit = 0;
 
 	ifc_name2unit(name, &unit);
 	error = ifc->ifcs_create(ifc, unit, ifc_data->params);
 	if (error != 0)
 		return (error);
 
 	*ifpp = ifunit(name);
 	return (0);
 }
 
 /*
  * Adapt a legacy "simple" destroy callback.  Interfaces whose unit is
  * below the cloner's minimum count can only be destroyed with
  * IFC_F_FORCE; otherwise EINVAL is returned.
  */
 static int
 ifc_simple_destroy_wrapper(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	const int forced = (flags & IFC_F_FORCE) != 0;
 
 	if (!forced && ifp->if_dunit < ifc->ifcs_minifs)
 		return (EINVAL);
 
 	ifc->ifcs_destroy(ifp);
 	return (0);
 }
 
 /*
  * Register a legacy "simple" cloner and create its first `minifs'
  * interfaces (units 0 .. minifs-1).
  *
  * Returns NULL when a cloner called `name' is already registered.
  * Failure to create one of the required interfaces is only caught by a
  * KASSERT.
  */
 struct if_clone *
 if_clone_simple(const char *name, ifcs_create_t create, ifcs_destroy_t destroy,
 	u_int minifs)
 {
 	struct if_clone *ifc;
 	u_int unit;
 
 	ifc = if_clone_alloc(name, 0);
 	ifc->ifc_match = ifc_simple_match;
 	ifc->ifc_create = ifc_simple_create_wrapper;
 	ifc->ifc_destroy = ifc_simple_destroy_wrapper;
 	ifc->ifcs_create = create;
 	ifc->ifcs_destroy = destroy;
 	ifc->ifcs_minifs = minifs;
 	ifc->ifc_flags = IFC_F_AUTOUNIT;
 
 	if (if_clone_attach(ifc) != 0)
 		return (NULL);
 
 	for (unit = 0; unit < minifs; unit++) {
 		/* Named ifname so it does not shadow the `name' parameter. */
 		char ifname[IFNAMSIZ];
 		int error __unused;
 		struct ifc_data ifd = {};
 		struct ifnet *ifp;
 
 		snprintf(ifname, IFNAMSIZ, "%s%d", ifc->ifc_name, unit);
 		error = if_clone_createif(ifc, ifname, IFNAMSIZ, &ifd, &ifp);
 		KASSERT(error == 0,
 		    ("%s: failed to create required interface %s",
 		    __func__, ifname));
 	}
 
 	EVENTHANDLER_INVOKE(if_clone_event, ifc);
 
 	return (ifc);
 }
 #endif
 
 /*
  * Unregister a network interface cloner.
  *
  * The cloner is taken off the current vnet's cloner list, every interface
  * still on its list is destroyed with IFC_F_FORCE, and one reference is
  * dropped.
  */
 void
 if_clone_detach(struct if_clone *ifc)
 {
 
 	IF_CLONERS_LOCK();
 	LIST_REMOVE(ifc, ifc_list);
 	V_if_cloners_count--;
 	IF_CLONERS_UNLOCK();
 
 	/* destroy all interfaces for this cloner */
 	/*
 	 * NOTE(review): the destroy result is ignored; a failing destroy
 	 * re-links the interface, so this loop relies on forced destroys
 	 * succeeding -- confirm.
 	 */
 	while (!LIST_EMPTY(&ifc->ifc_iflist))
 		if_clone_destroyif_flags(ifc, LIST_FIRST(&ifc->ifc_iflist), IFC_F_FORCE);
 
 	IF_CLONE_REMREF(ifc);
 }
 
 /*
  * Release a cloner's resources: its mutex, its unit-number allocator and
  * the structure itself.  Reached from IF_CLONE_REMREF_LOCKED() when the
  * reference count drops to zero; the interface list must be empty.
  */
 static void
 if_clone_free(struct if_clone *ifc)
 {
 
 	KASSERT(LIST_EMPTY(&ifc->ifc_iflist),
 	    ("%s: ifc_iflist not empty", __func__));
 
 	IF_CLONE_LOCK_DESTROY(ifc);
 	delete_unrhdr(ifc->ifc_unrhdr);
 	free(ifc, M_CLONE);
 }
 
 /*
  * Provide list of interface cloners to userspace.
  *
  * ifcr_total is set to the number of registered cloners.  When
  * ifcr_buffer is not NULL, up to ifcr_count cloner names are copied out
  * to it, each in an IFNAMSIZ-byte slot.  Returns EINVAL for a negative
  * ifcr_count, otherwise 0 or the copyout() error.
  */
 int
 if_clone_list(struct if_clonereq *ifcr)
 {
 	char *buf, *dst, *outbuf = NULL;
 	struct if_clone *ifc;
 	int buf_count, count, err = 0;
 
 	if (ifcr->ifcr_count < 0)
 		return (EINVAL);
 
 	IF_CLONERS_LOCK();
 	/*
 	 * Set our internal output buffer size.  We could end up not
 	 * reporting a cloner that is added between the unlock and lock
 	 * below, but that's not a major problem.  Not capping our
 	 * allocation to the number of cloners actually in the system
 	 * could be because that would let arbitrary users cause us to
 	 * allocate arbitrary amounts of kernel memory.
 	 */
 	buf_count = (V_if_cloners_count < ifcr->ifcr_count) ?
 	    V_if_cloners_count : ifcr->ifcr_count;
 	IF_CLONERS_UNLOCK();
 
 	/* Allocate with the lock dropped: M_WAITOK is used here. */
 	outbuf = malloc(IFNAMSIZ*buf_count, M_CLONE, M_WAITOK | M_ZERO);
 
 	IF_CLONERS_LOCK();
 
 	ifcr->ifcr_total = V_if_cloners_count;
 	if ((dst = ifcr->ifcr_buffer) == NULL) {
 		/* Just asking how many there are. */
 		goto done;
 	}
 	/* The count may have shrunk while the lock was dropped. */
 	count = (V_if_cloners_count < buf_count) ?
 	    V_if_cloners_count : buf_count;
 
 	for (ifc = LIST_FIRST(&V_if_cloners), buf = outbuf;
 	    ifc != NULL && count != 0;
 	    ifc = LIST_NEXT(ifc, ifc_list), count--, buf += IFNAMSIZ) {
 		strlcpy(buf, ifc->ifc_name, IFNAMSIZ);
 	}
 
 done:
 	IF_CLONERS_UNLOCK();
 	/* Unused slots were zeroed by M_ZERO, so the whole buffer goes out. */
 	if (err == 0 && dst != NULL)
 		err = copyout(outbuf, dst, buf_count*IFNAMSIZ);
 	if (outbuf != NULL)
 		free(outbuf, M_CLONE);
 	return (err);
 }
 
 #ifdef VIMAGE
 /*
  * if_clone_restoregroup() is used in context of if_vmove().
  *
  * Since if_detach_internal() has removed the interface from ALL groups, we
  * need to "restore" interface membership in the cloner's group.  Note that
  * interface belongs to cloner in its home vnet, so we first find the original
  * cloner, and then we confirm that cloner with the same name exists in the
  * current vnet.
  */
 void
 if_clone_restoregroup(struct ifnet *ifp)
 {
 	struct if_clone *ifc;
 	struct ifnet *ifcifp;
 	char ifc_name[IFCLOSIZ] = { [0] = '\0' };
 
 	/* Pass 1: in the home vnet, find the cloner that lists ifp. */
 	CURVNET_SET_QUIET(ifp->if_home_vnet);
 	IF_CLONERS_LOCK();
 	LIST_FOREACH(ifc, &V_if_cloners, ifc_list) {
 		IF_CLONE_LOCK(ifc);
 		LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) {
 			if (ifp == ifcifp) {
 				strncpy(ifc_name, ifc->ifc_name, IFCLOSIZ-1);
 				break;
 			}
 		}
 		IF_CLONE_UNLOCK(ifc);
 		if (ifc_name[0] != '\0')
 			break;
 	}
 	CURVNET_RESTORE();
 	/*
 	 * Pass 2: back in the caller's vnet (the cloners lock is still
 	 * held), look for a same-named cloner that uses groups.
 	 */
 	LIST_FOREACH(ifc, &V_if_cloners, ifc_list)
 		if (strcmp(ifc->ifc_name, ifc_name) == 0 &&
 		    ((ifc->ifc_flags & IFC_NOGROUP) == 0))
 			break;
 	IF_CLONERS_UNLOCK();
 
 	if (ifc != NULL)
 		if_addgroup(ifp, ifc_name);
 }
 #endif
 
 /*
  * Extract the unit number from an interface name of the form name###.
  *
  * *unit is set to -1 when the name carries no digits at all.  Leading
  * zeroes, non-digit characters after the first digit and values larger
  * than INT_MAX are rejected with EINVAL.
  *
  * Returns 0 on success and an error on failure.
  */
 int
 ifc_name2unit(const char *name, int *unit)
 {
 	const int	max_prefix = INT_MAX / 10;
 	const int	max_digit = INT_MAX % 10;
 	const char	*p = name;
 	int		digit;
 
 	/* Skip everything up to the first digit. */
 	while (*p != '\0' && !(*p >= '0' && *p <= '9'))
 		p++;
 
 	/* No digits: no unit was given. */
 	if (*p == '\0') {
 		*unit = -1;
 		return (0);
 	}
 
 	/* Disallow leading zeroes. */
 	if (p[0] == '0' && p[1] != '\0')
 		return (EINVAL);
 
 	*unit = 0;
 	while (*p != '\0') {
 		/* Bogus unit number. */
 		if (*p < '0' || *p > '9')
 			return (EINVAL);
 		digit = *p - '0';
 		/* Refuse to overflow INT_MAX. */
 		if (*unit > max_prefix ||
 		    (*unit == max_prefix && digit > max_digit))
 			return (EINVAL);
 		*unit = (*unit * 10) + digit;
 		p++;
 	}
 
 	return (0);
 }
 
 /*
  * Reserve exactly unit *unit for cloner `ifc'.
  *
  * Returns ENOSPC when the unit exceeds the cloner's maximum and EEXIST
  * when the unit is already reserved or an interface with the resulting
  * name already exists.  On success a cloner reference is taken.
  */
 static int
 ifc_alloc_unit_specific(struct if_clone *ifc, int *unit)
 {
 	char ifname[IFNAMSIZ];
 	const int wanted = *unit;
 
 	if (wanted > ifc->ifc_maxunit)
 		return (ENOSPC);
 
 	if (alloc_unr_specific(ifc->ifc_unrhdr, wanted) == -1)
 		return (EEXIST);
 
 	snprintf(ifname, IFNAMSIZ, "%s%d", ifc->ifc_name, wanted);
 	if (ifunit(ifname) == NULL) {
 		IF_CLONE_ADDREF(ifc);
 		return (0);
 	}
 
 	/* The name is taken by another interface: undo the reservation. */
 	free_unr(ifc->ifc_unrhdr, wanted);
 	return (EEXIST);
 }
 
 static int
 ifc_alloc_unit_next(struct if_clone *ifc, int *unit)
 {
 	int error;
 
 	*unit = alloc_unr(ifc->ifc_unrhdr);
 	if (*unit == -1)
 		return (ENOSPC);
 
 	free_unr(ifc->ifc_unrhdr, *unit);
 	for (;;) {
 		error = ifc_alloc_unit_specific(ifc, unit);
 		if (error != EEXIST)
 			break;
 
 		(*unit)++;
 	}
 
 	return (error);
 }
 
 /*
  * Reserve a unit for cloner `ifc': the specific unit in *unit when it is
  * non-negative, otherwise the next free one (returned through *unit).
  */
 int
 ifc_alloc_unit(struct if_clone *ifc, int *unit)
 {
 	if (*unit >= 0)
 		return (ifc_alloc_unit_specific(ifc, unit));
 	return (ifc_alloc_unit_next(ifc, unit));
 }
 
 /*
  * Release a unit reserved through ifc_alloc_unit() and drop the cloner
  * reference that came with it.  The cloner may be freed by this call.
  */
 void
 ifc_free_unit(struct if_clone *ifc, int unit)
 {
 
 	free_unr(ifc->ifc_unrhdr, unit);
 	IF_CLONE_REMREF(ifc);
 }
 
 /*
  * Default matcher: `name' matches when it is the cloner's name, optionally
  * followed by decimal digits only.  Returns 1 on a match and 0 otherwise.
  */
 static int
 ifc_simple_match(struct if_clone *ifc, const char *name)
 {
 	const char *cp;
 	const size_t namelen = strlen(ifc->ifc_name);
 	size_t i;
 
 	/*
 	 * Match the name.  A shorter `name' fails at its terminating NUL, so
 	 * nothing past the end of `name' is read.
 	 */
 	for (i = 0; i < namelen; i++) {
 		if (ifc->ifc_name[i] != name[i])
 			return (0);
 	}
 
 	/* Make sure there's a unit number or nothing after the name */
 	for (cp = name + namelen; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Parse the unit out of `name' and reserve it with cloner `ifc'.
  *
  * When the name carries no unit (wildcard), the next free unit is
  * reserved and appended to `name' in place; `len' is the size of the name
  * buffer.  The reserved unit is returned through *punit.  Returns 0 on
  * success or the error from ifc_name2unit() / ifc_alloc_unit().
  */
 static int
 ifc_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit)
 {
 	char *dp;
 	int wildcard;
 	int unit;
 	int err;
 
 	err = ifc_name2unit(name, &unit);
 	if (err != 0)
 		return (err);
 
 	/* ifc_name2unit() reports "no digits" as unit -1. */
 	wildcard = (unit < 0);
 
 	err = ifc_alloc_unit(ifc, &unit);
 	if (err != 0)
 		return (err);
 
 	/* In the wildcard case, we need to update the name. */
 	if (wildcard) {
 		/* Find the end of the name, then append the unit there. */
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			/*
 			 * This can only be a programmer error and
 			 * there's no straightforward way to recover if
 			 * it happens.
 			 */
 			panic("if_clone_create(): interface name too long");
 		}
 	}
 	*punit = unit;
 
 	return (0);
 }
 
 /*
  * Copy `len' bytes of creation parameters from ifd->params to `target'.
  *
  * Parameters flagged IFC_F_SYSSPACE are copied with memcpy(); all others
  * go through copyin().  Returns EINVAL when no parameters were supplied.
  */
 int
 ifc_copyin(const struct ifc_data *ifd, void *target, size_t len)
 {
 	if (ifd->params == NULL)
 		return (EINVAL);
 
 	if ((ifd->flags & IFC_F_SYSSPACE) == 0)
 		return (copyin(ifd->params, target, len));
 
 	memcpy(target, ifd->params, len);
 	return (0);
 }
 
 /*
  * Return the cloner's name.  The pointer refers to storage inside `ifc'.
  */
 const char *
 ifc_name(struct if_clone *ifc)
 {
 	return (ifc->ifc_name);
 }
 
 /*
  * Replace the cloner's flags with `flags'.  No lock is taken and the
  * value is stored unfiltered.
  */
 void
 ifc_flags_set(struct if_clone *ifc, int flags)
 {
 	ifc->ifc_flags = flags;
 }
 
 /*
  * Return the cloner's current flags.
  */
 int
 ifc_flags_get(struct if_clone *ifc)
 {
 	return (ifc->ifc_flags);
 }
diff --git a/sys/net/if_dead.c b/sys/net/if_dead.c
index 5721e9490776..8d645c29817b 100644
--- a/sys/net/if_dead.c
+++ b/sys/net/if_dead.c
@@ -1,143 +1,144 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * When an interface has been detached but not yet freed, we set the various
  * ifnet function pointers to "ifdead" versions.  This prevents unexpected
  * calls from the network stack into the device driver after if_detach() has
  * returned.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 
 /*
  * if_output replacement for a dead interface: free the packet and fail
  * with ENXIO.
  */
 static int
 ifdead_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa,
     struct route *ro)
 {
 
 	m_freem(m);
 	return (ENXIO);
 }
 
 /*
  * if_input replacement for a dead interface: free the received packet.
  */
 static void
 ifdead_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	m_freem(m);
 }
 
 /*
  * if_start replacement for a dead interface: does nothing.
  */
 static void
 ifdead_start(struct ifnet *ifp)
 {
 
 }
 
 /*
  * if_ioctl replacement for a dead interface: every request fails with
  * ENXIO.
  */
 static int
 ifdead_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 
 	return (ENXIO);
 }
 
 /*
  * if_resolvemulti replacement for a dead interface: no link-layer address
  * is produced (*llsa is set to NULL) and ENXIO is returned.
  */
 static int
 ifdead_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
     struct sockaddr *sa)
 {
 
 	*llsa = NULL;
 	return (ENXIO);
 }
 
 /*
  * if_qflush replacement for a dead interface: does nothing.
  */
 static void
 ifdead_qflush(struct ifnet *ifp)
 {
 
 }
 
 /*
  * if_transmit replacement for a dead interface: free the packet and fail
  * with ENXIO.
  */
 static int
 ifdead_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 
 	m_freem(m);
 	return (ENXIO);
 }
 
 /*
  * if_get_counter replacement for a dead interface: every counter reads as
  * zero.
  */
 static uint64_t
 ifdead_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 
 	return (0);
 }
 
 /*
  * if_snd_tag_alloc replacement for a dead interface: always fails with
  * EOPNOTSUPP and leaves *ppmt untouched.
  */
 static int
 ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * if_ratelimit_query replacement for a dead interface: fill `q' with an
  * empty result marked RT_NOSUPPORT.
  */
 static void
 ifdead_ratelimit_query(struct ifnet *ifp __unused,
       struct if_ratelimit_query_results *q)
 {
 	/*
 	 * A dead interface does not support rate limiting.  It is unclear
 	 * why a flag on the interface would claim that it does.
 	 */
 	q->rate_table = NULL;
 	q->flags = RT_NOSUPPORT;
 	q->max_flows = 0;
 	q->number_of_rates = 0;
 }
 
 /*
  * Point the listed ifnet method pointers of `ifp' at the "ifdead" stubs
  * defined in this file, so later calls through them no longer reach the
  * driver.
  */
 void
 if_dead(struct ifnet *ifp)
 {
 
 	ifp->if_output = ifdead_output;
 	ifp->if_input = ifdead_input;
 	ifp->if_start = ifdead_start;
 	ifp->if_ioctl = ifdead_ioctl;
 	ifp->if_resolvemulti = ifdead_resolvemulti;
 	ifp->if_qflush = ifdead_qflush;
 	ifp->if_transmit = ifdead_transmit;
 	ifp->if_get_counter = ifdead_get_counter;
 	ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
 	ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }
diff --git a/sys/net/if_disc.c b/sys/net/if_disc.c
index 14d544dfd86a..62313415de91 100644
--- a/sys/net/if_disc.c
+++ b/sys/net/if_disc.c
@@ -1,246 +1,247 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if_loop.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 /*
  * Discard interface driver for protocol testing and timing.
  * (Based on the loopback.)
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 /* MTU given to newly created disc interfaces. */
 #ifdef TINY_DSMTU
 #define	DSMTU	(1024+512)
 #else
 #define DSMTU	65532
 #endif
 
 /* Per-interface driver state: just a back pointer to the ifnet. */
 struct disc_softc {
 	struct ifnet *sc_ifp;
 };
 
 static int	discoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static int	discioctl(struct ifnet *, u_long, caddr_t);
 static int	disc_clone_create(struct if_clone *, int, caddr_t);
 static void	disc_clone_destroy(struct ifnet *);
 
 /* Driver name; also used as the malloc type's short name. */
 static const char discname[] = "disc";
 static MALLOC_DEFINE(M_DISC, discname, "Discard interface");
 
 /* The cloner is registered once per vnet. */
 VNET_DEFINE_STATIC(struct if_clone *, disc_cloner);
 #define	V_disc_cloner	VNET(disc_cloner)
 
 /*
  * Cloner create callback: allocate a softc and an IFT_LOOP ifnet for unit
  * `unit', set it up as a loopback-style discard interface, and attach it
  * together with a DLT_NULL BPF tap.  `ifc' and `params' are not used.
  * Returns ENOSPC when if_alloc() fails.
  */
 static int
 disc_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ifnet		*ifp;
 	struct disc_softc	*sc;
 
 	sc = malloc(sizeof(struct disc_softc), M_DISC, M_WAITOK | M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_LOOP);
 	if (ifp == NULL) {
 		free(sc, M_DISC);
 		return (ENOSPC);
 	}
 
 	ifp->if_softc = sc;
 	if_initname(ifp, discname, unit);
 	ifp->if_mtu = DSMTU;
 	/*
 	 * IFF_LOOPBACK should not be removed from disc's flags because
 	 * it controls what PF-specific routes are magically added when
 	 * a network address is assigned to the interface.  Things just
 	 * won't work as intended w/o such routes because the output
 	 * interface selection for a packet is totally route-driven.
 	 * A valid alternative to IFF_LOOPBACK can be IFF_BROADCAST or
 	 * IFF_POINTOPOINT, but it would result in different properties
 	 * of the interface.
 	 */
 	ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
 	ifp->if_drv_flags = IFF_DRV_RUNNING;
 	ifp->if_ioctl = discioctl;
 	ifp->if_output = discoutput;
 	/* No link-layer header or address. */
 	ifp->if_hdrlen = 0;
 	ifp->if_addrlen = 0;
 	ifp->if_snd.ifq_maxlen = 20;
 	if_attach(ifp);
 	/* The BPF header is the 4-byte address family; see discoutput(). */
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 
 	return (0);
 }
 
 /*
  * Cloner destroy callback: detach BPF and the interface, then free the
  * ifnet and its softc.
  */
 static void
 disc_clone_destroy(struct ifnet *ifp)
 {
 	struct disc_softc	*sc;
 
 	/* Fetch the softc before the ifnet is freed. */
 	sc = ifp->if_softc;
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 
 	free(sc, M_DISC);
 }
 
 /*
  * Per-vnet initialization: register the "disc" cloner, with no interfaces
  * created up front (minifs is 0).
  */
 static void
 vnet_disc_init(const void *unused __unused)
 {
 
 	V_disc_cloner = if_clone_simple(discname, disc_clone_create,
 	    disc_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_disc_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_disc_init, NULL);
 
 /*
  * Per-vnet teardown: unregister this vnet's "disc" cloner.
  */
 static void
 vnet_disc_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_disc_cloner);
 }
 VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_disc_uninit, NULL);
 
 /*
  * Module event handler.  Load and unload need no work here and succeed;
  * every other event type is rejected with EOPNOTSUPP.
  */
 static int
 disc_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t disc_mod = {
 	"if_disc",
 	disc_modevent,
 	NULL
 };
 
 DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 
 /*
  * Output routine: show the packet to BPF listeners (prefixed with its
  * address family), count it as transmitted and then discard it.  Always
  * returns 0.
  */
 static int
 discoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	u_int32_t family;
 
 	M_ASSERTPKTHDR(m);
 
 	/* BPF writes need to be handled specially. */
 	if (dst->sa_family != AF_UNSPEC)
 		family = RO_GET_FAMILY(ro, dst);
 	else
 		bcopy(dst->sa_data, &family, sizeof(family));
 
 	if (bpf_peers_present(ifp->if_bpf))
 		bpf_mtap2(ifp->if_bpf, &family, sizeof(family), m);
 
 	m->m_pkthdr.rcvif = ifp;
 
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Process an ioctl request.
  *
  * Handles address assignment (marks the interface up), multicast
  * add/delete (accepted for the compiled-in INET/INET6 families only) and
  * MTU changes.  Any other request fails with EINVAL.
  */
 static int
 discioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	int error = 0;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		/*
 		 * Everything else is done at a higher level.
 		 */
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (ifr == NULL) {
 			error = EAFNOSUPPORT;		/* XXX */
 			break;
 		}
 		/* Nothing to program; only the address family is checked. */
 		switch (ifr->ifr_addr.sa_family) {
 #ifdef INET
 		case AF_INET:
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			break;
 #endif
 		default:
 			error = EAFNOSUPPORT;
 			break;
 		}
 		break;
 	case SIOCSIFMTU:
 		/*
 		 * NOTE(review): the new MTU is stored without a range check
 		 * here; presumably the caller validates it -- confirm.
 		 */
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
 	default:
 		error = EINVAL;
 	}
 	return (error);
 }
diff --git a/sys/net/if_edsc.c b/sys/net/if_edsc.c
index 85a8e8cca2d3..8de583713410 100644
--- a/sys/net/if_edsc.c
+++ b/sys/net/if_edsc.c
@@ -1,375 +1,375 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if_loop.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 /*
  * Discard interface driver for protocol testing and timing.
  * Mimics an Ethernet device so that VLANs can be attached to it etc.
  */
 
 #include <sys/param.h>		/* types, important constants */
 #include <sys/kernel.h>		/* SYSINIT for load-time initializations */
 #include <sys/malloc.h>		/* malloc(9) */
 #include <sys/module.h>		/* module(9) */
 #include <sys/mbuf.h>		/* mbuf(9) */
 #include <sys/socket.h>		/* struct ifreq */
 #include <sys/sockio.h>		/* socket ioctl's */
 /* #include <sys/systm.h> if you need printf(9) or other all-purpose globals */
 
 #include <net/bpf.h>		/* bpf(9) */
 #include <net/ethernet.h>	/* Ethernet related constants and types */
 #include <net/if.h>
 #include <net/if_var.h>		/* basic part of ifnet(9) */
+#include <net/if_private.h>
 #include <net/if_clone.h>	/* network interface cloning */
 #include <net/if_types.h>	/* IFT_ETHER and friends */
-#include <net/if_var.h>		/* kernel-only part of ifnet(9) */
 #include <net/vnet.h>
 
 /*
  * Driver name.  It is passed to the cloning framework, used as the
  * interface name prefix and as the name of the malloc type.
  */
 static const char edscname[] = "edsc";
 
 /*
  * Software configuration of an interface specific to this device type.
  * One instance is allocated (zeroed) per interface in
  * edsc_clone_create() and freed in edsc_clone_destroy().
  */
 struct edsc_softc {
 	struct ifnet	*sc_ifp; /* ptr to generic interface configuration */
 
 	/*
 	 * A non-null driver can keep various things here, for instance,
 	 * the hardware revision, cached values of write-only registers, etc.
 	 */
 };
 
 /*
  * Attach to the interface cloning framework.
  * The per-vnet cloner handle is set in vnet_edsc_init() and released
  * in vnet_edsc_uninit().
  */
 VNET_DEFINE_STATIC(struct if_clone *, edsc_cloner);
 #define	V_edsc_cloner	VNET(edsc_cloner)
 static int	edsc_clone_create(struct if_clone *, int, caddr_t);
 static void	edsc_clone_destroy(struct ifnet *);
 
 /*
  * Interface driver methods.
  * They are installed into the ifnet by edsc_clone_create().
  */
 static void	edsc_init(void *dummy);
 /* static void edsc_input(struct ifnet *ifp, struct mbuf *m); would be here */
 static int	edsc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 static void	edsc_start(struct ifnet *ifp);
 
 /*
  * We'll allocate softc instances from this.
  */
 static		MALLOC_DEFINE(M_EDSC, edscname, "Ethernet discard interface");
 
 /*
  * Create an interface instance.
  *
  * Called by the cloning framework with the unit number to use.  The
  * ifc and params arguments are not used here.  Returns 0 on success or
  * ENOSPC if no ifnet could be allocated.
  */
 static int
 edsc_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct edsc_softc	*sc;
 	struct ifnet		*ifp;
 	struct ether_addr	eaddr;
 
 	/*
 	 * Allocate soft and ifnet structures.  Link each to the other.
 	 */
 	sc = malloc(sizeof(struct edsc_softc), M_EDSC, M_WAITOK | M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		free(sc, M_EDSC);
 		return (ENOSPC);
 	}
 
 	ifp->if_softc = sc;
 
 	/*
 	 * Get a name for this particular interface in its ifnet structure.
 	 */
 	if_initname(ifp, edscname, unit);
 
 	/*
 	 * Typical Ethernet interface flags: we can do broadcast and
 	 * multicast but can't hear our own broadcasts or multicasts.
 	 */
 	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
 
 	/*
 	 * We can pretend we have the whole set of hardware features
 	 * because we just discard all packets we get from the upper layer.
 	 * However, the features are disabled initially.  They can be
 	 * enabled via edsc_ioctl() when needed.
 	 */
 	ifp->if_capabilities =
 	    IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM |
 	    IFCAP_HWCSUM | IFCAP_TSO |
 	    IFCAP_JUMBO_MTU;
 	ifp->if_capenable = 0;
 
 	/*
 	 * Set the interface driver methods.
 	 */
 	ifp->if_init = edsc_init;
 	/* ifp->if_input = edsc_input; */
 	ifp->if_ioctl = edsc_ioctl;
 	ifp->if_start = edsc_start;
 
 	/*
 	 * Set the maximum output queue length from the global parameter.
 	 */
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 
 	/*
 	 * Generate an arbitrary MAC address for the cloned interface.
 	 */
 	ether_gen_addr(ifp, &eaddr);
 
 	/*
 	 * Do ifnet initializations common to all Ethernet drivers
 	 * and attach to the network interface framework.
 	 */
 	ether_ifattach(ifp, eaddr.octet);
 
 	/*
 	 * Now we can mark the interface as running, i.e., ready
 	 * for operation.
 	 */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	return (0);
 }
 
 /*
  * Destroy an interface instance.
  * Counterpart of edsc_clone_create(): detaches the interface and
  * releases both the ifnet and the softc.
  */
 static void
 edsc_clone_destroy(struct ifnet *ifp)
 {
 	/* Fetch the softc first; ifp is released below. */
 	struct edsc_softc	*sc = ifp->if_softc;
 
 	/*
 	 * Detach from the network interface framework.
 	 */
 	ether_ifdetach(ifp);
 
 	/*
 	 * Free memory occupied by ifnet and softc.
 	 */
 	if_free(ifp);
 	free(sc, M_EDSC);
 }
 
 /*
  * This method is invoked from ether_ioctl() when it's time
  * to bring up the hardware.
  *
  * There is no hardware behind this driver, so the function is a no-op.
  * The disabled code only sketches what a real driver would start with;
  * dummy is the softc pointer in that sketch.
  */
 static void
 edsc_init(void *dummy)
 {
 #if 0	/* what a hardware driver would do here... */
 	struct edsc_softc	*sc = (struct edsc_softc *)dummy;
 	struct ifnet		*ifp = sc->sc_ifp;
 
 	/* blah-blah-blah */
 #endif
 }
 
 /*
  * Network interfaces are controlled via the ioctl(2) syscall.
  *
  * Only SIOCSIFCAP is handled here; every other command is passed on to
  * ether_ioctl() and its result is returned.  Returns 0 for SIOCSIFCAP.
  */
 static int
 edsc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq		*ifr = (struct ifreq *)data;
 
 	switch (cmd) {
 	case SIOCSIFCAP:
 #if 1
 		/*
 		 * Just turn on any capabilities requested.
 		 * The generic ifioctl() function has already made sure
 		 * that they are supported, i.e., set in if_capabilities.
 		 */
 		ifp->if_capenable = ifr->ifr_reqcap;
 #else
 		/*
 		 * A h/w driver would need to analyze the requested
 		 * bits and program the hardware, e.g.:
 		 *
 		 * (Illustration only: mask is not declared in this
 		 * function and the if/else bodies below are placeholders.)
 		 */
 		mask = ifp->if_capenable ^ ifr->ifr_reqcap;
 
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 
 			if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING)
 				/* blah-blah-blah */
 			else
 				/* etc-etc-etc */
 		}
 #endif
 		break;
 
 	default:
 		/*
 		 * Offload the rest onto the common Ethernet handler.
 		 */
 		return (ether_ioctl(ifp, cmd, data));
 	}
 
 	return (0);
 }
 
 /*
  * Process the output queue.
  *
  * Every packet queued on ifp->if_snd is shown to bpf, counted as
  * transmitted and then freed.
  */
 static void
 edsc_start(struct ifnet *ifp)
 {
 	struct mbuf		*m;
 
 	/*
 	 * A hardware interface driver can set IFF_DRV_OACTIVE
 	 * in ifp->if_drv_flags:
 	 *
 	 * ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 	 *
 	 * to prevent if_start from being invoked again while the
 	 * transmission is under way.  The flag is to protect the
 	 * device's transmitter, not the method itself.  The output
 	 * queue is locked and several threads can process it in
 	 * parallel safely, so the driver can use other means to
 	 * serialize access to the transmitter.
 	 *
 	 * If using IFF_DRV_OACTIVE, the driver should clear the flag
 	 * not earlier than the current transmission is complete, e.g.,
 	 * upon an interrupt from the device, not just before returning
 	 * from if_start.  This method merely starts the transmission,
 	 * which may proceed asynchronously.
 	 */
 
 	/*
 	 * We loop getting packets from the queue until it's empty.
 	 * A h/w driver would loop until the device can accept more
 	 * data into its buffer, or while there are free transmit
 	 * descriptors, or whatever.
 	 */
 	for (;;) {
 		/*
 		 * Try to dequeue one packet.  Stop if the queue is empty.
 		 * Use IF_DEQUEUE() here if ALTQ(9) support is unneeded.
 		 */
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
 
 		/*
 		 * Let bpf(9) at the packet.
 		 */
 		BPF_MTAP(ifp, m);
 
 		/*
 		 * Update the interface counters.
 		 */
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 
 		/*
 		 * Finally, just drop the packet.
 		 * TODO: Reply to ARP requests unless IFF_NOARP is set.
 		 */
 		m_freem(m);
 	}
 
 	/*
 	 * ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 	 * would be here only if the transmission were synchronous.
 	 */
 }
 
 /*
  * Per-vnet initialization handler: registers the edsc cloner and keeps
  * its handle in V_edsc_cloner.  The argument is unused.
  */
 static void
 vnet_edsc_init(const void *unused __unused)
 {
 
 	/*
 	 * Connect to the network interface cloning framework.
 	 * The last argument is the number of units to be created
 	 * from the outset.  It's also the minimum number of units
 	 * allowed.  We don't want any units created as soon as the
 	 * driver is loaded.
 	 */
 	V_edsc_cloner = if_clone_simple(edscname, edsc_clone_create,
 	    edsc_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_edsc_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_edsc_init, NULL);
 
 /*
  * Per-vnet teardown handler: detaches the cloner registered by
  * vnet_edsc_init().  The argument is unused.
  */
 static void
 vnet_edsc_uninit(const void *unused __unused)
 {
 
 	/*
 	 * Disconnect from the cloning framework.
 	 * Existing interfaces will be disposed of properly.
 	 */
 	if_clone_detach(V_edsc_cloner);
 }
 VNET_SYSUNINIT(vnet_edsc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_edsc_uninit, NULL);
 
 /*
  * Module event handler.  Load and unload are accepted without doing
  * any work; all remaining event types (see module(9)) are not handled
  * and are answered with EOPNOTSUPP.
  */
 static int
 edsc_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /* Module description handed to DECLARE_MODULE() below. */
 static moduledata_t edsc_mod = {
 	"if_edsc",			/* name */
 	edsc_modevent,			/* event handler */
 	NULL				/* additional data */
 };
 
 DECLARE_MODULE(if_edsc, edsc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
diff --git a/sys/net/if_enc.c b/sys/net/if_enc.c
index af1d16f11a18..da6ce7a1a815 100644
--- a/sys/net/if_enc.c
+++ b/sys/net/if_enc.c
@@ -1,449 +1,450 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006 The FreeBSD Project.
  * Copyright (c) 2015 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_enc.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 
 /* MTU assigned to the interface in enc_clone_create(). */
 #define ENCMTU		(1024+512)
 
 /* XXX these defines must have the same values as in OpenBSD */
 #define M_CONF		0x0400	/* payload was encrypted (ESP-transport) */
 #define M_AUTH		0x0800	/* payload was authenticated (AH or ESP auth) */
 #define M_AUTH_AH	0x2000	/* header was authenticated (AH) */
 
 /*
  * Header handed to bpf in front of every tapped packet; it is filled
  * in by enc_bpftap() and its size is announced to bpfattach().
  */
 struct enchdr {
 	u_int32_t af;
 	u_int32_t spi;
 	u_int32_t flags;
 };
 /* Per-interface state; there is at most one instance per vnet. */
 struct enc_softc {
 	struct	ifnet *sc_ifp;
 };
 /* The single enc softc of this vnet, or NULL if none exists. */
 VNET_DEFINE_STATIC(struct enc_softc *, enc_sc);
 #define	V_enc_sc	VNET(enc_sc)
 /* Cloner handle, set in vnet_enc_init(). */
 VNET_DEFINE_STATIC(struct if_clone *, enc_cloner);
 #define	V_enc_cloner	VNET(enc_cloner)
 
 /* Interface methods, cloner callbacks and helper-hook management. */
 static int	enc_ioctl(struct ifnet *, u_long, caddr_t);
 static int	enc_output(struct ifnet *, struct mbuf *,
     const struct sockaddr *, struct route *);
 static int	enc_clone_create(struct if_clone *, int, caddr_t);
 static void	enc_clone_destroy(struct ifnet *);
 static int	enc_add_hhooks(struct enc_softc *);
 static void	enc_remove_hhooks(struct enc_softc *);
 
 /* Driver name, used for the cloner and as the interface name prefix. */
 static const char encname[] = "enc";
 
 #define	IPSEC_ENC_AFTER_PFIL	0x04
 /*
  * Before and after are relative to when we are stripping the
  * outer IP header.
  *
  * The AFTER_PFIL flag is used only for bpf_mask_*.  It enables BPF
  * capturing after PFIL hook execution.  This can be useful when a PFIL
  * hook changes the packet, e.g. by address translation.  If a PFIL hook
  * consumes the mbuf, nothing will be captured.
  */
 /* Per-vnet defaults; each mask is exported through a sysctl. */
 VNET_DEFINE_STATIC(int, filter_mask_in) = IPSEC_ENC_BEFORE;
 VNET_DEFINE_STATIC(int, bpf_mask_in) = IPSEC_ENC_BEFORE;
 VNET_DEFINE_STATIC(int, filter_mask_out) = IPSEC_ENC_BEFORE;
 VNET_DEFINE_STATIC(int, bpf_mask_out) = IPSEC_ENC_BEFORE | IPSEC_ENC_AFTER;
 #define	V_filter_mask_in	VNET(filter_mask_in)
 #define	V_bpf_mask_in		VNET(bpf_mask_in)
 #define	V_filter_mask_out	VNET(filter_mask_out)
 #define	V_bpf_mask_out		VNET(bpf_mask_out)
 
 /*
  * sysctl tree: net.enc.in.* and net.enc.out.* expose the per-vnet
  * firewall filter masks and bpf masks as read/write integers.
  */
 static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "enc sysctl");
 static SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "enc input sysctl");
 static SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "enc output sysctl");
 SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask,
     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_in), 0,
     "IPsec input firewall filter mask");
 SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask,
     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_in), 0,
     "IPsec input bpf mask");
 SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask,
     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_out), 0,
     "IPsec output firewall filter mask");
 SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask,
     CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_out), 0,
     "IPsec output bpf mask");
 
 /*
  * Destroy the enc interface: detach it from bpf and from the interface
  * layer, free the ifnet and its softc, and clear the per-vnet softc
  * pointer so that a new instance can be created later.
  */
 static void
 enc_clone_destroy(struct ifnet *ifp)
 {
 	struct enc_softc *sc;
 
 	sc = ifp->if_softc;
 	/* The message names what is actually compared. */
 	KASSERT(sc == V_enc_sc, ("sc != V_enc_sc"));
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 	free(sc, M_DEVBUF);
 	V_enc_sc = NULL;
 }
 
 /*
  * Create the enc interface for the current vnet.
  *
  * Only one instance may exist per vnet.  Returns 0 on success, ENOSPC
  * if no ifnet could be allocated and EEXIST if an instance already
  * exists.  The ifc and params arguments are not used here.
  */
 static int
 enc_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ifnet *ifp;
 	struct enc_softc *sc;
 
 	sc = malloc(sizeof(struct enc_softc), M_DEVBUF,
 	    M_WAITOK | M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ENC);
 	if (ifp == NULL) {
 		free(sc, M_DEVBUF);
 		return (ENOSPC);
 	}
 	/* Refuse a second instance and undo the allocations above. */
 	if (V_enc_sc != NULL) {
 		if_free(ifp);
 		free(sc, M_DEVBUF);
 		return (EEXIST);
 	}
 	V_enc_sc = sc;
 	if_initname(ifp, encname, unit);
 	ifp->if_mtu = ENCMTU;
 	ifp->if_ioctl = enc_ioctl;
 	ifp->if_output = enc_output;
 	ifp->if_softc = sc;
 	if_attach(ifp);
 	/* Taps on this interface carry a struct enchdr in front. */
 	bpfattach(ifp, DLT_ENC, sizeof(struct enchdr));
 	return (0);
 }
 
 /*
  * Output method: nothing is ever transmitted through this interface,
  * so the packet is freed and success is reported.
  */
 static int
 enc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * ioctl method.  Only SIOCSIFFLAGS is supported: the driver's RUNNING
  * flag is made to follow IFF_UP.  Everything else yields EINVAL.
  */
 static int
 enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 		if ((ifp->if_flags & IFF_UP) != 0)
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		else
 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Show a packet to the bpf listeners of the enc interface.
  *
  * The tap is skipped when the enc stage is not selected in the bpf mask
  * of the given direction, or when nobody is listening.  Otherwise an
  * enchdr carrying the address family, the SPI and the M_CONF/M_AUTH
  * flags derived from the SA is passed to bpf along with the packet.
  */
 static void
 enc_bpftap(struct ifnet *ifp, struct mbuf *m, const struct secasvar *sav,
     int32_t hhook_type, uint8_t enc, uint8_t af)
 {
 	struct enchdr hdr;
 
 	switch (hhook_type) {
 	case HHOOK_TYPE_IPSEC_IN:
 		if ((enc & V_bpf_mask_in) == 0)
 			return;
 		break;
 	case HHOOK_TYPE_IPSEC_OUT:
 		if ((enc & V_bpf_mask_out) == 0)
 			return;
 		break;
 	default:
 		break;
 	}
 	if (!bpf_peers_present(ifp->if_bpf))
 		return;
 
 	hdr.flags = 0;
 	if (sav->alg_enc != SADB_EALG_NONE)
 		hdr.flags |= M_CONF;
 	if (sav->alg_auth != SADB_AALG_NONE)
 		hdr.flags |= M_AUTH;
 	hdr.spi = sav->spi;
 	hdr.af = af;
 	bpf_mtap2(ifp->if_bpf, &hdr, sizeof(hdr), m);
 }
 
 /*
  * A single helper hook function serves all hook points.
  * + from hhook_type we can determine the packet direction:
  *   HHOOK_TYPE_IPSEC_IN or HHOOK_TYPE_IPSEC_OUT;
  * + from hhook_id we can determine address family: AF_INET or AF_INET6;
  * + udata contains pointer to enc_softc;
  * + ctx_data contains pointer to struct ipsec_ctx_data.
  *
  * Returns 0 when processing may continue, EPFNOSUPPORT or EINVAL for an
  * unexpected address family or hook type, and EACCES when the packet
  * filter did not pass the packet (*ctx->mp is then set to NULL).
  */
 static int
 enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data,
     void *hdata, struct osd *hosd)
 {
 	struct ipsec_ctx_data *ctx;
 	struct enc_softc *sc;
 	struct ifnet *ifp, *rcvif;
 	struct pfil_head *ph;
 	int pdir;
 
 	sc = (struct enc_softc *)udata;
 	ifp = sc->sc_ifp;
 	/* An interface that is down neither taps nor filters. */
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return (0);
 
 	ctx = (struct ipsec_ctx_data *)ctx_data;
 	/* XXX: wrong hook point was used by caller? */
 	if (ctx->af != hhook_id)
 		return (EPFNOSUPPORT);
 
 	enc_bpftap(ifp, *ctx->mp, ctx->sav, hhook_type, ctx->enc, ctx->af);
 	switch (hhook_type) {
 	case HHOOK_TYPE_IPSEC_IN:
 		if (ctx->enc == IPSEC_ENC_BEFORE) {
 			/* Do accounting only once */
 			if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 			if_inc_counter(ifp, IFCOUNTER_IBYTES,
 			    (*ctx->mp)->m_pkthdr.len);
 		}
 		if ((ctx->enc & V_filter_mask_in) == 0)
 			return (0); /* skip pfil processing */
 		pdir = PFIL_IN;
 		break;
 	case HHOOK_TYPE_IPSEC_OUT:
 		if (ctx->enc == IPSEC_ENC_BEFORE) {
 			/* Do accounting only once */
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
 			    (*ctx->mp)->m_pkthdr.len);
 		}
 		if ((ctx->enc & V_filter_mask_out) == 0)
 			return (0); /* skip pfil processing */
 		pdir = PFIL_OUT;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* Pick the pfil head of the address family, if compiled in. */
 	switch (hhook_id) {
 #ifdef INET
 	case AF_INET:
 		ph = V_inet_pfil_head;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ph = V_inet6_pfil_head;
 		break;
 #endif
 	default:
 		ph = NULL;
 	}
 	/* Nothing to do if no filter is hooked in this direction. */
 	if (ph == NULL || (pdir == PFIL_OUT && !PFIL_HOOKED_OUT(ph)) ||
 	    (pdir == PFIL_IN && !PFIL_HOOKED_IN(ph)))
 		return (0);
 	/* Make the packet look like it was received on enc(4) */
 	rcvif = (*ctx->mp)->m_pkthdr.rcvif;
 	(*ctx->mp)->m_pkthdr.rcvif = ifp;
 	if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, ctx->inp) != PFIL_PASS) {
 		*ctx->mp = NULL; /* consumed by filter */
 		return (EACCES);
 	}
 	/* Restore the original receive interface after filtering. */
 	(*ctx->mp)->m_pkthdr.rcvif = rcvif;
 	enc_bpftap(ifp, *ctx->mp, ctx->sav, hhook_type,
 	    IPSEC_ENC_AFTER_PFIL, ctx->af);
 	return (0);
 }
 
 /*
  * Register enc_hhook() on the IPsec input and output helper hook
  * points of every address family compiled in, with sc as hook data.
  *
  * Returns 0 on success.  If no address family is compiled in,
  * EPFNOSUPPORT is returned.  If a registration fails, its error is
  * returned and the hooks registered so far are removed again, so that
  * no hook is left referring to sc (the caller destroys the interface,
  * and with it sc, when this function fails).
  */
 static int
 enc_add_hhooks(struct enc_softc *sc)
 {
 	struct hookinfo hki;
 	int error;
 
 	error = EPFNOSUPPORT;
 	hki.hook_func = enc_hhook;
 	hki.hook_helper = NULL;
 	hki.hook_udata = sc;
 #ifdef INET
 	hki.hook_id = AF_INET;
 	hki.hook_type = HHOOK_TYPE_IPSEC_IN;
 	error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    &hki, HHOOK_WAITOK);
 	if (error != 0) {
 		enc_remove_hhooks(sc);
 		return (error);
 	}
 	hki.hook_type = HHOOK_TYPE_IPSEC_OUT;
 	error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    &hki, HHOOK_WAITOK);
 	if (error != 0) {
 		enc_remove_hhooks(sc);
 		return (error);
 	}
 #endif
 #ifdef INET6
 	hki.hook_id = AF_INET6;
 	hki.hook_type = HHOOK_TYPE_IPSEC_IN;
 	error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6],
 	    &hki, HHOOK_WAITOK);
 	if (error != 0) {
 		enc_remove_hhooks(sc);
 		return (error);
 	}
 	hki.hook_type = HHOOK_TYPE_IPSEC_OUT;
 	error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6],
 	    &hki, HHOOK_WAITOK);
 	if (error != 0) {
 		enc_remove_hhooks(sc);
 		return (error);
 	}
 #endif
 	return (error);
 }
 
 /*
  * Counterpart of enc_add_hhooks(): asks for enc_hhook() with hook data
  * sc to be removed from the IPsec input and output hook points of
  * every address family compiled in.  Return values are ignored.
  */
 static void
 enc_remove_hhooks(struct enc_softc *sc)
 {
 	struct hookinfo hki;
 
 	hki.hook_func = enc_hhook;
 	hki.hook_helper = NULL;
 	hki.hook_udata = sc;
 #ifdef INET
 	hki.hook_id = AF_INET;
 	hki.hook_type = HHOOK_TYPE_IPSEC_IN;
 	hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki);
 	hki.hook_type = HHOOK_TYPE_IPSEC_OUT;
 	hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki);
 #endif
 #ifdef INET6
 	hki.hook_id = AF_INET6;
 	hki.hook_type = HHOOK_TYPE_IPSEC_IN;
 	hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki);
 	hki.hook_type = HHOOK_TYPE_IPSEC_OUT;
 	hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki);
 #endif
 }
 
 /*
  * Per-vnet initialization handler: resets the softc pointer and
  * registers the enc cloner.  The last argument of if_clone_simple()
  * is 1 here, i.e. one unit is requested from the outset.
  */
 static void
 vnet_enc_init(const void *unused __unused)
 {
 
 	V_enc_sc = NULL;
 	V_enc_cloner = if_clone_simple(encname, enc_clone_create,
 	    enc_clone_destroy, 1);
 }
 VNET_SYSINIT(vnet_enc_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_enc_init, NULL);
 
 /*
  * Second per-vnet initialization step: hook the enc interface, which
  * must exist by now, into the IPsec helper hooks.  If that fails the
  * interface is destroyed again.
  *
  * The (unused) argument is const-qualified like that of the other
  * sysinit handlers in this file.
  */
 static void
 vnet_enc_init_proto(const void *unused __unused)
 {
 	KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc));
 
 	if (enc_add_hhooks(V_enc_sc) != 0)
 		enc_clone_destroy(V_enc_sc->sc_ifp);
 }
 VNET_SYSINIT(vnet_enc_init_proto, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_enc_init_proto, NULL);
 
 /*
  * Per-vnet teardown handler: detaches the enc cloner.  The interface
  * is expected to still exist at this point (asserted).
  */
 static void
 vnet_enc_uninit(const void *unused __unused)
 {
 	KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc));
 
 	if_clone_detach(V_enc_cloner);
 }
 VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_enc_uninit, NULL);
 
 /*
  * The hhook consumer needs to go before ip[6]_destroy are called on
  * SI_ORDER_THIRD.
  *
  * Removes the helper hooks registered for this vnet's enc softc.
  */
 static void
 vnet_enc_uninit_hhook(const void *unused __unused)
 {
 	KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc));
 
 	enc_remove_hhooks(V_enc_sc);
 }
 VNET_SYSUNINIT(vnet_enc_uninit_hhook, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     vnet_enc_uninit_hhook, NULL);
 
 /*
  * Module event handler.  Load and unload succeed without doing any
  * work; all other event types are answered with EOPNOTSUPP.
  */
 static int
 enc_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 static moduledata_t enc_mod = {
 	"if_enc",
 	enc_modevent,
 	0
 };
 
 DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_enc, 1);
diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index cbe75de52c0f..81538c0cb157 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -1,931 +1,932 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 The FreeBSD Foundation
  * Copyright (c) 2009-2021 Bjoern A. Zeeb <bz@FreeBSD.org>
  *
  * This software was developed by CK Software GmbH under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * A pair of virtual back-to-back connected ethernet like interfaces
  * (``two interfaces with a virtual cross-over cable'').
  *
  * This is mostly intended to be used to provide connectivity between
  * different virtual network stack instances.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_rss.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/taskqueue.h>
 #include <sys/types.h>
 #include <sys/buf_ring.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #ifdef RSS
 #include <net/rss_config.h>
 #ifdef INET
 #include <netinet/in_rss.h>
 #endif
 #ifdef INET6
 #include <netinet6/in6_rss.h>
 #endif
 #endif
 #include <net/vnet.h>
 
 /* Driver name; also used as the name of the malloc type below. */
 static const char epairname[] = "epair";
 #define	RXRSIZE	4096	/* Probably overkill by 4-8x. */
 
 static MALLOC_DEFINE(M_EPAIR, epairname,
     "Pair of virtual cross-over connected Ethernet-like interfaces");
 
 /* Per-vnet cloner handle. */
 VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
 #define	V_epair_cloner	VNET(epair_cloner)
 
 static unsigned int next_index = 0;
 /* Wrappers around the epair_n_index_mtx mutex. */
 #define	EPAIR_LOCK_INIT()		mtx_init(&epair_n_index_mtx, "epairidx", \
 					    NULL, MTX_DEF)
 #define	EPAIR_LOCK_DESTROY()		mtx_destroy(&epair_n_index_mtx)
 #define	EPAIR_LOCK()			mtx_lock(&epair_n_index_mtx)
 #define	EPAIR_UNLOCK()			mtx_unlock(&epair_n_index_mtx)
 
 /*
  * Bit numbers within epair_queue.state.
  * BIT_QUEUE_TASK is set when the queue's task gets scheduled and
  * cleared by the task once it has drained a ring.
  * BIT_MBUF_QUEUED is set by the producer before it enqueues a packet
  * and makes the task reschedule itself if it finds the bit set.
  */
 #define BIT_QUEUE_TASK		0
 #define BIT_MBUF_QUEUED		1
 
 struct epair_softc;
 /*
  * One receive queue of an interface.  Producers enqueue into the ring
  * selected by ridx while the task drains the other one.
  */
 struct epair_queue {
 	int			 id;
 	struct buf_ring		*rxring[2];
 	volatile int		 ridx;		/* 0 || 1 */
 	volatile long		 state;		/* taskqueue coordination */
 	struct task		 tx_task;
 	struct epair_softc	*sc;
 };
 
 /* Mutex used by the EPAIR_LOCK*() macros. */
 static struct mtx epair_n_index_mtx;
 /* Per-interface state; each half of a pair has its own instance. */
 struct epair_softc {
 	struct ifnet		*ifp;		/* This ifp. */
 	struct ifnet		*oifp;		/* other ifp of pair. */
 	int			 num_queues;
 	struct epair_queue	*queues;
 	struct ifmedia		 media;		/* Media config (fake). */
 	STAILQ_ENTRY(epair_softc) entry;
 };
 
 /* Taskqueues running the queue tasks; indexed by epair_queue.id. */
 struct epair_tasks_t {
 	int			 tasks;
 	struct taskqueue	 *tq[MAXCPU];
 };
 
 static struct epair_tasks_t epair_tasks;
 
 /*
  * Strip per-packet state that must not travel to the other side:
  * a send tag, if present, and all non-persistent mbuf tags.
  */
 static void
 epair_clear_mbuf(struct mbuf *m)
 {
 	/*
 	 * ether_input will barf on CSUM_SND_TAG, so drop the send tag
 	 * reference and clear the flag.
 	 */
 	if ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0) {
 		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		m->m_pkthdr.snd_tag = NULL;
 	}
 
 	m_tag_delete_nonpersistent(m);
 }
 
 /*
  * Drain ring ridx of queue q and pass every packet to the input
  * routine of sc's interface, in the vnet of that interface.
  */
 static void
 epair_if_input(struct epair_softc *sc, struct epair_queue *q, int ridx)
 {
 	struct ifnet *ifp;
 	struct mbuf *m;
 
 	ifp = sc->ifp;
 	CURVNET_SET(ifp->if_vnet);
 	while (! buf_ring_empty(q->rxring[ridx])) {
 		m = buf_ring_dequeue_mc(q->rxring[ridx]);
 		/* A failed dequeue just retries the emptiness check. */
 		if (m == NULL)
 			continue;
 
 		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		(*ifp->if_input)(ifp, m);
 
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Task handler of a queue (arg is the struct epair_queue).
  *
  * Switches the queue to its other ring, delivers everything found in
  * the ring that was active until now, and reschedules itself if a
  * packet was queued in the meantime.  A reference on the interface is
  * held for the duration of the call.
  */
 static void
 epair_tx_start_deferred(void *arg, int pending)
 {
 	struct epair_queue *q = (struct epair_queue *)arg;
 	struct epair_softc *sc = q->sc;
 	int ridx, nidx;
 
 	if_ref(sc->ifp);
 	/* Atomically flip q->ridx; ridx keeps the previous value. */
 	ridx = atomic_load_int(&q->ridx);
 	do {
 		nidx = (ridx == 0) ? 1 : 0;
 	} while (!atomic_fcmpset_int(&q->ridx, &ridx, nidx));
 	epair_if_input(sc, q, ridx);
 
 	/* Done with this run; go again if a producer flagged new data. */
 	atomic_clear_long(&q->state, (1 << BIT_QUEUE_TASK));
 	if (atomic_testandclear_long(&q->state, BIT_MBUF_QUEUED))
 		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
 
 	if_rele(sc->ifp);
 }
 
 /*
  * Pick the queue of sc that packet m goes to.
  *
  * Without RSS this is always queue 0.  With RSS the bucket reported by
  * rss_m2bucket() is used; if that call fails, IPv4/IPv6 packets are
  * hashed in software and everything else falls into bucket 0.  The
  * bucket is then reduced modulo the number of queues.
  */
 static struct epair_queue *
 epair_select_queue(struct epair_softc *sc, struct mbuf *m)
 {
 	uint32_t bucket;
 #ifdef RSS
 	struct ether_header *eh;
 	int ret;
 
 	ret = rss_m2bucket(m, &bucket);
 	if (ret) {
 		/* Actually hash the packet. */
 		eh = mtod(m, struct ether_header *);
 
 		switch (ntohs(eh->ether_type)) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			rss_soft_m2cpuid_v4(m, 0, &bucket);
 			break;
 #endif
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			rss_soft_m2cpuid_v6(m, 0, &bucket);
 			break;
 #endif
 		default:
 			bucket = 0;
 			break;
 		}
 	}
 	bucket %= sc->num_queues;
 #else
 	bucket = 0;
 #endif
 	return (&sc->queues[bucket]);
 }
 
 /*
  * Prepare packet m for delivery on src_ifp: strip send tag and
  * non-persistent tags, then set the receive interface and the FIB
  * from src_ifp.  The packet must be a single one with a packet header.
  */
 static void
 epair_prepare_mbuf(struct mbuf *m, struct ifnet *src_ifp)
 {
 	M_ASSERTPKTHDR(m);
 	epair_clear_mbuf(m);
 	if_setrcvif(m, src_ifp);
 	M_SETFIB(m, src_ifp->if_fib);
 
 	MPASS(m->m_nextpkt == NULL);
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 }
 
 /*
  * Queue packet m for reception on the interface described by osc (the
  * softc of the receiving side) and update the counters of both sides.
  * If the ring is full, the packet is freed and counted as a queue drop
  * on the sending interface.  The mbuf is consumed in either case.
  */
 static void
 epair_menq(struct mbuf *m, struct epair_softc *osc)
 {
 	struct ifnet *ifp, *oifp;
 	int len, ret;
 	int ridx;
 	short mflags;
 
 	/*
 	 * I know this looks weird. We pass the "other sc" as we need that one
 	 * and can get both ifps from it as well.
 	 */
 	oifp = osc->ifp;
 	ifp = osc->oifp;
 
 	epair_prepare_mbuf(m, oifp);
 
 	/* Save values as once the mbuf is queued, it's not ours anymore. */
 	len = m->m_pkthdr.len;
 	mflags = m->m_flags;
 
 	struct epair_queue *q = epair_select_queue(osc, m);
 
 	/* Flag pending data before the enqueue so the task can see it. */
 	atomic_set_long(&q->state, (1 << BIT_MBUF_QUEUED));
 	ridx = atomic_load_int(&q->ridx);
 	ret = buf_ring_enqueue(q->rxring[ridx], m);
 	if (ret != 0) {
 		/* Ring is full. */
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	/*
 	 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
 	 * but as we bypass all this we have to duplicate
 	 * the logic another time.
 	 */
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 	if (mflags & (M_BCAST|M_MCAST))
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	/* Someone else received the packet. */
 	if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
 
 	/* Schedule the queue's task unless it is already scheduled. */
 	if (!atomic_testandset_long(&q->state, BIT_QUEUE_TASK))
 		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
 }
 
 /*
  * Drain the send queue of ifp: each packet is shown to bpf and then
  * either handed to the other interface of the pair or, if one of the
  * two interfaces is not up and running, dropped.
  */
 static void
 epair_start(struct ifnet *ifp)
 {
 	struct mbuf *m;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 
 	/*
 	 * We get packets here from ether_output via if_handoff()
 	 * and need to put them into the receive-queue (rxq) of the
 	 * other interface (oifp) of our pair.
 	 */
 	sc = ifp->if_softc;
 	oifp = sc->oifp;
 	/* From here on sc is the softc of the other interface. */
 	sc = oifp->if_softc;
 	for (;;) {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
 		M_ASSERTPKTHDR(m);
 		BPF_MTAP(ifp, m);
 
 		/* In case either interface is not usable drop the packet. */
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 		    (ifp->if_flags & IFF_UP) == 0 ||
 		    (oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 		    (oifp->if_flags & IFF_UP) == 0) {
 			m_freem(m);
 			continue;
 		}
 
 		epair_menq(m, sc);
 	}
 }
 
 /*
  * if_transmit handler.
  *
  * Consumes m in every case.  Returns ENXIO if this interface is not
  * running, ENETDOWN if it is not up, the ALTQ enqueue error when ALTQ is
  * enabled and the enqueue fails, and 0 otherwise (including when the
  * packet is dropped because the peer interface is unusable, and when m
  * is NULL).
  */
 static int
 epair_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 #ifdef ALTQ
 	int len;
 	short mflags;
 #endif
 
 	if (m == NULL)
 		return (0);
 	M_ASSERTPKTHDR(m);
 
 	/*
 	 * We are not going to use the interface en/dequeue mechanism
 	 * on the TX side. We are called from ether_output_frame()
 	 * and will put the packet into the receive-queue (rxq) of the
 	 * other interface (oifp) of our pair.
 	 */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENETDOWN);
 	}
 
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * In case the outgoing interface is not usable,
 	 * drop the packet.
 	 */
 	sc = ifp->if_softc;
 	oifp = sc->oifp;
 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (oifp->if_flags & IFF_UP) == 0) {
 		/* Counted as an output error, but success is reported. */
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (0);
 	}
 
 #ifdef ALTQ
 	/* Save these before the mbuf is handed to ALTQ. */
 	len = m->m_pkthdr.len;
 	mflags = m->m_flags;
 	int error = 0;
 
 	/* Support ALTQ via the classic if_start() path. */
 	IF_LOCK(&ifp->if_snd);
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
 		if (error)
 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		IF_UNLOCK(&ifp->if_snd);
 		if (!error) {
 			/*
 			 * NOTE(review): epair_start() -> epair_menq() also
 			 * bumps OBYTES/OMCASTS for each packet it forwards,
 			 * so these look double-counted on the ALTQ path --
 			 * confirm whether that is intended.
 			 */
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 			if (mflags & (M_BCAST|M_MCAST))
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 			epair_start(ifp);
 		}
 		return (error);
 	}
 	IF_UNLOCK(&ifp->if_snd);
 #endif
 
 	/* No ALTQ: enqueue directly; epair_menq() wants the peer's softc. */
 	epair_menq(m, oifp->if_softc);
 	return (0);
 }
 
 /* if_qflush handler; intentionally does nothing. */
 static void
 epair_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /* ifmedia change callback: nothing to reconfigure, always succeeds. */
 static int
 epair_media_change(struct ifnet *ifp __unused)
 {
 	return (0);
 }
 
 /*
  * ifmedia status callback: always report a valid, active, full-duplex
  * 10G-T Ethernet medium.
  */
 static void
 epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
 {
 	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 }
 
 /*
  * Interface ioctl handler.
  *
  * Flag and multicast membership changes are accepted without further
  * action, media requests go to ifmedia_ioctl(), MTU changes are applied
  * unchecked, and everything else is left to ether_ioctl().
  * Returns 0 or the error reported by the delegated handler.
  */
 static int
 epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *req = (struct ifreq *)data;
 	struct epair_softc *softc;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		return (0);
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		softc = ifp->if_softc;
 		return (ifmedia_ioctl(ifp, req, &softc->media, cmd));
 
 	case SIOCSIFMTU:
 		/* Any MTU value is accepted as-is. */
 		ifp->if_mtu = req->ifr_mtu;
 		return (0);
 
 	default:
 		/* Fall back to the generic Ethernet handler. */
 		return (ether_ioctl(ifp, cmd, data));
 	}
 }
 
 /* if_init handler; intentionally does nothing. */
 static void
 epair_init(void *dummy __unused)
 {
 }
 
 /*
  * Interface cloning functions.
  * We use our private ones so that we can create/destroy our secondary
  * device along with the primary one.
  */
 static int
 epair_clone_match(struct if_clone *ifc, const char *name)
 {
 	const size_t baselen = sizeof(epairname) - 1;
 	const char *p;
 
 	/*
 	 * Accept the bare base name ("epair") and the base name followed
 	 * only by decimal digits ("epair<n>").  Anything carrying a
 	 * trailing [ab] suffix, or any other character, is rejected.
 	 */
 	if (strncmp(epairname, name, baselen) != 0)
 		return (0);
 
 	p = name + baselen;
 	while (*p != '\0') {
 		if (!(*p >= '0' && *p <= '9'))
 			return (0);
 		p++;
 	}
 
 	return (1);
 }
 
 /*
  * Attach the second ("b") interface of a pair and register it with the
  * cloner.  Its Ethernet address is the peer's hardware address with the
  * last octet replaced by 0x0b.
  */
 static void
 epair_clone_add(struct if_clone *ifc, struct epair_softc *scb)
 {
 	uint8_t mac[ETHER_ADDR_LEN];
 	struct ifnet *b_ifp = scb->ifp;
 
 	memcpy(mac, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
 	mac[5] = 0x0b;
 
 	ether_ifattach(b_ifp, mac);
 	if_clone_addif(ifc, b_ifp);
 }
 
 /*
  * Allocate one half of a pair: the ifnet, the softc, one receive queue per
  * configured task (each with two buffer rings and a deferred-start task),
  * and the pseudo media list.
  *
  * Returns NULL only if the ifnet cannot be allocated.
  */
 static struct epair_softc *
 epair_alloc_sc(struct if_clone *ifc)
 {
 	struct epair_softc *softc;
 	struct epair_queue *queue;
 	struct ifnet *new_ifp;
 	int qi;
 
 	new_ifp = if_alloc(IFT_ETHER);
 	if (new_ifp == NULL)
 		return (NULL);
 
 	softc = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	softc->ifp = new_ifp;
 	softc->num_queues = epair_tasks.tasks;
 	softc->queues = mallocarray(softc->num_queues,
 	    sizeof(struct epair_queue), M_EPAIR, M_WAITOK);
 
 	for (qi = 0; qi < softc->num_queues; qi++) {
 		queue = &softc->queues[qi];
 		queue->sc = softc;
 		queue->id = qi;
 		queue->state = 0;
 		queue->ridx = 0;
 		queue->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK,
 		    NULL);
 		queue->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK,
 		    NULL);
 		NET_TASK_INIT(&queue->tx_task, 0, epair_tx_start_deferred,
 		    queue);
 	}
 
 	/* Set up the single pseudo media type and select it. */
 	ifmedia_init(&softc->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&softc->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&softc->media, IFM_ETHER | IFM_10G_T);
 
 	return (softc);
 }
 
 /*
  * Fill in the ifnet belonging to sc: identity (name, driver name, unit),
  * flags and capabilities, driver entry points, send queue and baudrate.
  */
 static void
 epair_setup_ifp(struct epair_softc *sc, char *name, int unit)
 {
 	struct ifnet *nifp;
 
 	nifp = sc->ifp;
 
 	/* Identity. */
 	nifp->if_softc = sc;
 	nifp->if_dname = epairname;
 	nifp->if_dunit = unit;
 	strlcpy(nifp->if_xname, name, IFNAMSIZ);
 
 	/* Flags and capabilities. */
 	nifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
 	    IFF_KNOWSEPOCH;
 	nifp->if_capabilities = IFCAP_VLAN_MTU;
 	nifp->if_capenable = IFCAP_VLAN_MTU;
 
 	/* Driver entry points. */
 	nifp->if_init = epair_init;
 	nifp->if_ioctl = epair_ioctl;
 	nifp->if_start = epair_start;
 	nifp->if_transmit = epair_transmit;
 	nifp->if_qflush = epair_qflush;
 
 	if_setsendqlen(nifp, ifqmaxlen);
 	if_setsendqready(nifp);
 
 	/* Arbitrary maximum. */
 	nifp->if_baudrate = IF_Gbps(10);
 }
 
 /*
  * Generate the Ethernet address for the "a" side of a pair.
  *
  * The address is 02:hh:hh:hh:hh:0a, where hh.. are the four bytes of a
  * Jenkins hash over a monotonically advancing index and the 64-bit host
  * id.  eaddr must have room for ETHER_ADDR_LEN (6) bytes.
  */
 static void
 epair_generate_mac(struct epair_softc *sc, uint8_t *eaddr)
 {
 	struct ifnet *ifp = sc->ifp;
 	uint32_t key[3];
 	uint32_t hash;
 	unsigned long hid;
 	uint64_t hostid;
 
 	/*
 	 * Calculate the etheraddr hashing the hostid and the
 	 * interface index. The result would be hopefully unique.
 	 * Note that the "a" component of an epair instance may get moved
 	 * to a different VNET after creation. In that case its index
 	 * will be freed and the index can get reused by new epair instance.
 	 * Make sure we do not create same etheraddr again.
 	 *
 	 * getcredhostid() stores an unsigned long, so fetch it into a
 	 * variable of exactly that type and widen afterwards; writing
 	 * through a cast pointer into a uint64_t would leave half of it
 	 * uninitialised where unsigned long is 32 bits wide.
 	 */
 	getcredhostid(curthread->td_ucred, &hid);
 	hostid = hid;
 	if (hostid == 0)
 		arc4rand(&hostid, sizeof(hostid), 0);
 
 	/* next_index never goes backwards, so a reused if_index is harmless. */
 	EPAIR_LOCK();
 	if (ifp->if_index > next_index)
 		next_index = ifp->if_index;
 	else
 		next_index++;
 
 	key[0] = (uint32_t)next_index;
 	EPAIR_UNLOCK();
 	key[1] = (uint32_t)(hostid & 0xffffffff);
 	key[2] = (uint32_t)((hostid >> 32) & 0xffffffff);
 	hash = jenkins_hash32(key, 3, 0);
 
 	eaddr[0] = 0x02;
 	memcpy(&eaddr[1], &hash, 4);
 	eaddr[5] = 0x0a;
 }
 
 /*
  * Release everything epair_alloc_sc() set up for sc: the ifnet, the media
  * list, both rings of every queue, the queue array and the softc itself.
  * A NULL sc is ignored.
  */
 static void
 epair_free_sc(struct epair_softc *sc)
 {
 	struct epair_queue *queue;
 	int qi;
 
 	if (sc == NULL)
 		return;
 
 	if_free(sc->ifp);
 	ifmedia_removeall(&sc->media);
 
 	for (qi = 0; qi < sc->num_queues; qi++) {
 		queue = &sc->queues[qi];
 		buf_ring_free(queue->rxring[0], M_EPAIR);
 		buf_ring_free(queue->rxring[1], M_EPAIR);
 	}
 
 	free(sc->queues, M_EPAIR);
 	free(sc, M_EPAIR);
 }
 
 /*
  * Bring the interface into or out of the running state.  Going up, the
  * RUNNING flag is set before the link state is announced; going down, the
  * link state is announced before the flag is cleared.
  */
 static void
 epair_set_state(struct ifnet *ifp, bool running)
 {
 	if (!running) {
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if_link_state_change(ifp, LINK_STATE_UP);
 }
 
 /*
  * Determine and reserve the unit number for a new pair and turn name into
  * the name of the "a" interface.
  *
  * name:  in/out buffer of size len holding "epair" or "epair<n>"; on
  *        success it holds "epair<n>a".
  * punit: receives the allocated unit on success.
  *
  * Returns 0 on success.  Fails with the error from ifc_name2unit() or
  * ifc_alloc_unit(), ENOSPC if the buffer cannot hold the unit number plus
  * the one-character suffix, or EEXIST if either the "a" or the "b"
  * interface already exists.  Once the unit has been allocated, every
  * failure path releases it again.
  */
 static int
 epair_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit)
 {
 	int error = 0, unit, wildcard;
 	char *dp;
 
 	/* Try to see if a special unit was requested. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	/* A negative unit means the caller did not ask for a specific one. */
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If no unit had been given, we need to adjust the ifName.
 	 * Also make sure there is space for our extra [ab] suffix.
 	 */
 	/* Advance dp to the terminating NUL of name. */
 	for (dp = name; *dp != '\0'; dp++);
 	if (wildcard) {
 		int slen = snprintf(dp, len - (dp - name), "%d", unit);
 		if (slen > len - (dp - name) - 1) {
 			/* ifName too long. */
 			error = ENOSPC;
 			goto done;
 		}
 		dp += slen;
 	}
 	if (len - (dp - name) - 1 < 1) {
 		/* No space left for our [ab] suffix. */
 		error = ENOSPC;
 		goto done;
 	}
 	*dp = 'b';
 	/* Must not change dp so we can replace 'a' by 'b' later. */
 	*(dp+1) = '\0';
 
 	/* Check whether the 'b' interface already exists... */
 	if (ifunit(name) != NULL) {
 		error = EEXIST;
 		goto done;
 	}
 
 	/* ...and then the 'a' interface; name keeps the 'a' suffix. */
 	*dp = 'a';
 	if (ifunit(name) != NULL) {
 		error = EEXIST;
 		goto done;
 	}
 	*punit = unit;
 done:
 	/* Give the unit back if anything after its allocation failed. */
 	if (error != 0)
 		ifc_free_unit(ifc, unit);
 
 	return (error);
 }
 
 /*
  * Cloner create callback: create both interfaces of a pair.
  *
  * name: in/out buffer of size len; on success it holds the name of the
  *       "a" interface.
  * ifpp: receives the ifnet of the "a" interface on success.
  *
  * Returns 0 on success, the error from epair_handle_unit(), or ENOSPC if
  * either softc cannot be allocated (the unit is released in that case).
  */
 static int
 epair_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct epair_softc *sca, *scb;
 	struct ifnet *ifp;
 	char *dp;
 	int error, unit;
 	uint8_t eaddr[ETHER_ADDR_LEN];	/* filled in by epair_generate_mac() */
 
 	/* Reserves the unit and leaves name with the 'a' suffix. */
 	error = epair_handle_unit(ifc, name, len, &unit);
 	if (error != 0)
 		return (error);
 
 	/* Allocate memory for both [ab] interfaces */
 	sca = epair_alloc_sc(ifc);
 	scb = epair_alloc_sc(ifc);
 	if (sca == NULL || scb == NULL) {
 		/* epair_free_sc() tolerates NULL. */
 		epair_free_sc(sca);
 		epair_free_sc(scb);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 
 	/*
 	 * Cross-reference the interfaces so we will be able to free both.
 	 */
 	sca->oifp = scb->ifp;
 	scb->oifp = sca->ifp;
 
 	/* Finish initialization of interface <n>a. */
 	ifp = sca->ifp;
 	epair_setup_ifp(sca, name, unit);
 	epair_generate_mac(sca, eaddr);
 
 	ether_ifattach(ifp, eaddr);
 
 	/* Swap the name and finish initialization of interface <n>b. */
 	dp = name + strlen(name) - 1;
 	*dp = 'b';
 
 	epair_setup_ifp(scb, name, unit);
 
 	ifp = scb->ifp;
 	/* We need to play some tricks here for the second interface. */
 	/*
 	 * NOTE(review): this copy is overwritten by the next strlcpy()
 	 * before name is read again, so it appears to be redundant.
 	 */
 	strlcpy(name, epairname, len);
 	/* Correctly set the name for the cloner list. */
 	strlcpy(name, scb->ifp->if_xname, len);
 
 	/* Attaches <n>b, deriving its address from <n>a's. */
 	epair_clone_add(ifc, scb);
 
 	/*
 	 * Restore name to <n>a as the ifp for this will go into the
 	 * cloner list for the initial call.
 	 */
 	strlcpy(name, sca->ifp->if_xname, len);
 
 	/* Tell the world, that we are ready to rock. */
 	epair_set_state(sca->ifp, true);
 	epair_set_state(scb->ifp, true);
 
 	*ifpp = sca->ifp;
 
 	return (0);
 }
 
 /* Dequeue and free every mbuf still sitting in any receive ring of sc. */
 static void
 epair_drain_rings(struct epair_softc *sc)
 {
 	struct epair_queue *queue;
 	struct mbuf *pkt;
 	int ring, qi;
 
 	for (ring = 0; ring < 2; ring++) {
 		for (qi = 0; qi < sc->num_queues; qi++) {
 			queue = &sc->queues[qi];
 			while ((pkt = buf_ring_dequeue_sc(
 			    queue->rxring[ring])) != NULL)
 				m_freem(pkt);
 		}
 	}
 }
 
 /*
  * Cloner destroy callback: tear down both interfaces of a pair.
  *
  * Destroying either interface destroys its peer as well.  The peer is
  * removed through if_clone_destroyif(), which re-enters this function;
  * the re-entrant call is recognised by the peer's if_softc having been
  * cleared beforehand.  Always returns 0; a failure to destroy the peer
  * panics.
  */
 static int
 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct ifnet *oifp;
 	struct epair_softc *sca, *scb;
 	int unit, error;
 
 	/*
 	 * In case we called into if_clone_destroyif() ourselves
 	 * again to remove the second interface, the softc will be
 	 * NULL. In that case do not do anything but return success.
 	 */
 	if (ifp->if_softc == NULL)
 		return (0);
 
 	unit = ifp->if_dunit;
 	sca = ifp->if_softc;
 	oifp = sca->oifp;
 	scb = oifp->if_softc;
 
 	/* First get the interfaces down and detached. */
 	epair_set_state(ifp, false);
 	epair_set_state(oifp, false);
 
 	ether_ifdetach(ifp);
 	ether_ifdetach(oifp);
 
 	/*
 	 * Then free any queued packets and all the resources.  The peer is
 	 * handled within its own vnet.
 	 */
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	epair_drain_rings(scb);
 	/* Mark the peer so the nested destroy call returns immediately. */
 	oifp->if_softc = NULL;
 	error = if_clone_destroyif(ifc, oifp);
 	if (error)
 		panic("%s: if_clone_destroyif() for our 2nd iface failed: %d",
 		    __func__, error);
 	epair_free_sc(scb);
 	CURVNET_RESTORE();
 
 	epair_drain_rings(sca);
 	epair_free_sc(sca);
 
 	/* Last free the cloner unit. */
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /* Per-VNET initialisation: attach the epair cloner with our callbacks. */
 static void
 vnet_epair_init(const void *unused __unused)
 {
 	struct if_clone_addreq addreq = {
 		.create_f = epair_clone_create,
 		.destroy_f = epair_clone_destroy,
 		.match_f = epair_clone_match,
 	};
 
 	V_epair_cloner = ifc_attach_cloner(epairname, &addreq);
 }
 VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_epair_init, NULL);
 
 /* Per-VNET teardown: detach the cloner attached by vnet_epair_init(). */
 static void
 vnet_epair_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_epair_cloner);
 }
 VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_epair_uninit, NULL);
 
 /*
  * Module-load initialisation: create the taskqueue(s) that run the
  * per-queue tasks and record how many there are in epair_tasks.tasks.
  *
  * With RSS, one taskqueue is created per CPU with a single thread whose
  * CPU set contains only that CPU; otherwise a single taskqueue with one
  * thread is created.  Always returns 0.
  */
 static int
 epair_mod_init(void)
 {
 	char name[32];
 	epair_tasks.tasks = 0;
 
 #ifdef RSS
 	int cpu;
 
 	CPU_FOREACH(cpu) {
 		cpuset_t cpu_mask;
 
 		/* Pin to this CPU so we get appropriate NUMA allocations. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
 
 		snprintf(name, sizeof(name), "epair_task_%d", cpu);
 
 		epair_tasks.tq[cpu] = taskqueue_create(name, M_WAITOK,
 		    taskqueue_thread_enqueue,
 		    &epair_tasks.tq[cpu]);
 		/* Restrict the queue's thread to this one CPU. */
 		CPU_SETOF(cpu, &cpu_mask);
 		taskqueue_start_threads_cpuset(&epair_tasks.tq[cpu], 1, PI_NET,
 		    &cpu_mask, "%s", name);
 
 		epair_tasks.tasks++;
 	}
 	/* Undo the binding of the current thread done inside the loop. */
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #else
 	snprintf(name, sizeof(name), "epair_task");
 
 	epair_tasks.tq[0] = taskqueue_create(name, M_WAITOK,
 	    taskqueue_thread_enqueue,
 	    &epair_tasks.tq[0]);
 	taskqueue_start_threads(&epair_tasks.tq[0], 1, PI_NET, "%s", name);
 
 	epair_tasks.tasks = 1;
 #endif
 
 	return (0);
 }
 
 /* Module-unload cleanup: drain and free every taskqueue we created. */
 static void
 epair_mod_cleanup(void)
 {
 	int idx = 0;
 
 	while (idx < epair_tasks.tasks) {
 		taskqueue_drain_all(epair_tasks.tq[idx]);
 		taskqueue_free(epair_tasks.tq[idx]);
 		idx++;
 	}
 }
 
 /*
  * Module event handler.  MOD_LOAD initialises the lock and the task
  * queues, MOD_UNLOAD tears them down again; every other event yields
  * EOPNOTSUPP.
  */
 static int
 epair_modevent(module_t mod, int type, void *data)
 {
 	int error;
 
 	if (type == MOD_LOAD) {
 		EPAIR_LOCK_INIT();
 		error = epair_mod_init();
 		if (error != 0)
 			return (error);
 		if (bootverbose)
 			printf("%s: %s initialized.\n", __func__, epairname);
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 		epair_mod_cleanup();
 		EPAIR_LOCK_DESTROY();
 		if (bootverbose)
 			printf("%s: %s unloaded.\n", __func__, epairname);
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /* Module descriptor binding the name "if_epair" to epair_modevent(). */
 static moduledata_t epair_mod = {
 	"if_epair",
 	epair_modevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
 MODULE_VERSION(if_epair, 3);
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index a42e15dda97a..b3fc71006a35 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -1,1487 +1,1488 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/msan.h>
 #include <sys/proc.h>
 #include <sys/priv.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 
 #include <net/ieee_oui.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 #include <security/mac/mac_framework.h>
 
 #include <crypto/sha1.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(pfil_head_t, link_pfil_head);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); 
 
 /* The all-ones Ethernet broadcast address. */
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 /* Forward declarations of file-local handlers. */
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 /*
  * Record an error and jump to the enclosing function's "bad" label.
  * Requires a local variable "error" and a label "bad" in scope.
  */
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 /*
  * Translate the checksum-offload requests found on src into the matching
  * "already verified" flags and OR them into dst.  When data-checksum
  * validity is set, dst's csum_data is set to 0xffff as well.
  * src and dst may refer to the same mbuf.
  */
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int verified = 0;
 
 	if ((src->m_pkthdr.csum_flags & CSUM_IP) != 0)
 		verified |= CSUM_IP_CHECKED | CSUM_IP_VALID;
 	if ((src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0)
 		verified |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 	if ((src->m_pkthdr.csum_flags & CSUM_SCTP) != 0)
 		verified |= CSUM_SCTP_VALID;
 
 	dst->m_pkthdr.csum_flags |= verified;
 	if ((verified & CSUM_DATA_VALID) != 0)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  */
 static int
 ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	const u_char *dst_lladdr;
 	struct ether_header *hdr;
 	struct arphdr *arp;
 	uint16_t type_be;
 	uint16_t arp_op;
 
 	/* Only link-layer encapsulation into a large enough buffer. */
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 	if (req->bufsize < ETHER_HDR_LEN)
 		return (ENOMEM);
 
 	hdr = (struct ether_header *)req->buf;
 	dst_lladdr = req->lladdr;
 	req->lladdr_off = 0;
 
 	/* Pick the ethertype (network byte order) for the address family. */
 	if (req->family == AF_INET) {
 		type_be = htons(ETHERTYPE_IP);
 	} else if (req->family == AF_INET6) {
 		type_be = htons(ETHERTYPE_IPV6);
 	} else if (req->family == AF_ARP) {
 		arp = (struct arphdr *)req->hdata;
 		arp->ar_hrd = htons(ARPHRD_ETHER);
 
 		/* Reverse-ARP opcodes get their own ethertype. */
 		arp_op = ntohs(arp->ar_op);
 		if (arp_op == ARPOP_REVREQUEST || arp_op == ARPOP_REVREPLY)
 			type_be = htons(ETHERTYPE_REVARP);
 		else
 			type_be = htons(ETHERTYPE_ARP);
 
 		if ((req->flags & IFENCAP_FLAG_BROADCAST) != 0)
 			dst_lladdr = ifp->if_broadcastaddr;
 	} else {
 		return (EAFNOSUPPORT);
 	}
 
 	/* Assemble the header and report its size back to the caller. */
 	memcpy(&hdr->ether_type, &type_be, sizeof(hdr->ether_type));
 	memcpy(hdr->ether_dhost, dst_lladdr, ETHER_ADDR_LEN);
 	memcpy(hdr->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	req->bufsize = sizeof(struct ether_header);
 
 	return (0);
 }
 
 /*
  * Build the Ethernet header for a packet to dst into phdr.
  *
  * Unicast IPv4/IPv6 destinations are resolved through arpresolve() /
  * nd6_resolve(); broadcast and multicast destinations are mapped directly.
  * If plle is non-NULL it is cleared first and then passed on to the
  * resolver.
  *
  * Returns 0 on success with *pflags set to RT_MAY_LOOP, plus RT_L2_ME when
  * the resolver reported LLE_IFADDR.  On a resolver error that error is
  * returned, with EHOSTDOWN turned into EHOSTUNREACH for routes that have
  * RT_HAS_GW set.  For an unsupported address family the mbuf is freed and
  * EAFNOSUPPORT is returned.
  */
 static int
 ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro, u_char *phdr,
 	uint32_t *pflags, struct llentry **plle)
 {
 	uint32_t lleflags = 0;
 	int error = 0;
 #if defined(INET) || defined(INET6)
 	struct ether_header *eh = (struct ether_header *)phdr;
 	uint16_t etype;
 #endif
 
 	if (plle)
 		*plle = NULL;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags,
 			    plle);
 		else {
 			/* No lookup needed: fill in the header by hand. */
 			if (m->m_flags & M_BCAST)
 				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
 				    ETHER_ADDR_LEN);
 			else {
 				const struct in_addr *a;
 				a = &(((const struct sockaddr_in *)dst)->sin_addr);
 				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
 			}
 			etype = htons(ETHERTYPE_IP);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0) {
 			int af = RO_GET_FAMILY(ro, dst);
 			error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
 			    &lleflags, plle);
 		} else {
 			/* Multicast: map the address directly. */
 			const struct in6_addr *a6;
 			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
 			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
 			etype = htons(ETHERTYPE_IPV6);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* Behind a gateway, "host down" is reported as "host unreachable". */
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 /*
  * ifp: outgoing interface.
  * m:   packet to send.
  * dst: destination address, used to resolve the link-layer header.
  * ro:  optional route; may supply a ready-made header (ro_prepend) or a
  *      cached link-layer entry (ro_lle).
  *
  * Returns 0 or an errno value.  EWOULDBLOCK from address resolution is
  * reported as 0.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	int error = 0;
 	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	bool loop_copy;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 	struct llentry *lle = NULL;
 	int addref = 0;
 
 	/*
 	 * First try to obtain a prebuilt header: either one supplied by
 	 * the caller or one from the route's cached link-layer entry.
 	 */
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		/* XXX BPF uses ro_prepend */
 		if (ro->ro_prepend != NULL) {
 			phdr = ro->ro_prepend;
 			hlen = ro->ro_plen;
 		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
 			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
 				lle = ro->ro_lle;
 				/* Drop a cached entry that is no longer valid. */
 				if (lle != NULL &&
 				    (lle->la_flags & LLE_VALID) == 0) {
 					LLE_FREE(lle);
 					lle = NULL;	/* redundant */
 					ro->ro_lle = NULL;
 				}
 				if (lle == NULL) {
 					/* if we lookup, keep cache */
 					addref = 1;
 				} else
 					/*
 					 * Notify LLE code that
 					 * the entry was used
 					 * by datapath.
 					 */
 					llentry_provide_feedback(lle);
 			}
 			if (lle != NULL) {
 				phdr = lle->r_linkdata;
 				hlen = lle->r_hdrlen;
 				pflags = lle->r_flags;
 			}
 		}
 	}
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	/* Refuse to send on monitor-mode or down interfaces. */
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = ETHER_HDR_LEN;
 		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
 		    addref ? &lle : NULL);
 		/* Remember a freshly looked-up entry in the route cache. */
 		if (addref && lle != NULL)
 			ro->ro_lle = lle;
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	/* Destination is one of our own addresses: loop the packet back. */
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
 	}
 	loop_copy = (pflags & RT_MAY_LOOP) != 0;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 *
 	 * Note that we do prepend regardless of RT_HAS_HEADER flag.
 	 * This is done because BPF code shifts m_data pointer
 	 * to the end of ethernet header prior to calling if_output().
 	 */
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, phdr, hlen);
 	}
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		struct mbuf *n;
 
 		/*
 		 * Because if_simloop() modifies the packet, we need a
 		 * writable copy through m_dup() instead of a readonly
 		 * one as m_copy[m] would give us. The alternative would
 		 * be to modify if_simloop() to handle the readonly mbuf,
 		 * but performancewise it is mostly equivalent (trading
 		 * extra data copying vs. extra locking).
 		 *
 		 * XXX This is a local workaround.  A number of less
 		 * often used kernel parts suffer from the same bug.
 		 * See PR kern/105943 for a proposed general solution.
 		 */
 		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 			update_mbuf_csumflags(m, n);
 			(void)if_simloop(ifp, n, RO_GET_FAMILY(ro, dst), hlen);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
        /*
 	* Bridges require special output handling.
 	*/
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 			/*
 			 * Shared error exit: senderr() and the carp failure
 			 * path jump here to free the mbuf (if any) and
 			 * return the error.
 			 */
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		/* The hook succeeded and left no mbuf to send. */
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 static bool
 ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp)
 {
 	struct ether_8021q_tag qtag;
 	struct ether_header *eh;
 
 	eh = mtod(*mp, struct ether_header *);
 	if (ntohs(eh->ether_type) == ETHERTYPE_VLAN ||
 	    ntohs(eh->ether_type) == ETHERTYPE_QINQ)
 		return (true);
 
 	qtag.vid = 0;
 	qtag.pcp = pcp;
 	qtag.proto = ETHERTYPE_VLAN;
 	if (ether_8021q_frame(mp, ifp, ifp, &qtag))
 		return (true);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (false);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  */
 /*
  * ifp: interface to transmit on.
  * m:   complete Ethernet frame.
  *
  * Returns the result of the driver's if_transmit routine, or earlier:
  * 0 if inserting the priority tag failed or a pfil hook consumed the
  * frame, EACCES if a pfil hook dropped it, and (EXPERIMENTAL builds only)
  * EAFNOSUPPORT for IPv4/ARP frames on an IPv6-only interface.
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	uint8_t pcp;
 
 	/* Apply the interface's default priority, except on vlan interfaces. */
 	pcp = ifp->if_pcp;
 	if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN &&
 	    !ether_set_pcp(&m, ifp, pcp))
 		return (0);
 
 	/* Run link-layer output filter hooks, if any are installed. */
 	if (PFIL_HOOKED_OUT(V_link_pfil_head))
 		switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT,
 		    NULL)) {
 		case PFIL_DROPPED:
 			return (EACCES);
 		case PFIL_CONSUMED:
 			return (0);
 		}
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 		struct ether_header *eh;
 
 		eh = mtod(m, struct ether_header *);
 		switch (ntohs(eh->ether_type)) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	/*
 	 * Queue message on interface, update output statistics if successful,
 	 * and start output if interface not yet active.
 	 *
 	 * If KMSAN is enabled, use it to verify that the data does not contain
 	 * any uninitialized bytes.
 	 */
 	kmsan_check_mbuf(m, "ether_output");
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	random_harvest_queue_ether(m, sizeof(*m));
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 		switch (etype) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return;
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_ethernet_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_ethernet_p)(ifp, m);
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 &&
 	    ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ))) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 */
 	if (ifp->if_bridge != NULL) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 	struct ifnet *rcvif;
 
 	/*
 	 * netisr handler for NETISR_ETHER: the packet must carry a packet
 	 * header whose rcvif names the receiving interface.
 	 */
 	M_ASSERTPKTHDR(m);
 	rcvif = m->m_pkthdr.rcvif;
 	KASSERT(rcvif != NULL, ("%s: NULL interface pointer", __func__));
 	ether_input_internal(rcvif, m);
 }
 
 /*
  * netisr registration record for Ethernet input.  Packets queued on
  * NETISR_ETHER are handed to ether_nh_input().  Dispatch is direct in
  * both configurations; only the CPU selection policy differs.
  */
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	/* With RSS, rss_m2cpuid() picks the CPU for each mbuf. */
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 /*
  * Register the Ethernet netisr handler.  Invoked through the SYSINIT
  * below at SI_SUB_INIT_IF; the argument is unused.
  */
 static void
 ether_init(__unused void *arg)
 {
 
 	netisr_register(&ether_nh);
 }
 SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
 
 static void
 vnet_ether_init(__unused void *arg)
 {
 	struct pfil_head_args pha;
 
 	/*
 	 * Describe and register the link-layer pfil head for this vnet:
 	 * Ethernet frames, hooked in both directions.
 	 */
 	pha.pa_headname = PFIL_ETHER_NAME;
 	pha.pa_type = PFIL_TYPE_ETHERNET;
 	pha.pa_flags = PFIL_IN | PFIL_OUT;
 	pha.pa_version = PFIL_VERSION;
 	V_link_pfil_head = pfil_head_register(&pha);
 
 #ifdef VIMAGE
 	/* Per-vnet netisr registration is only done on VIMAGE kernels. */
 	netisr_register_vnet(&ether_nh);
 #endif
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-vnet teardown: unregister the link-layer pfil head that
  * vnet_ether_init() registered.  Hooked at SI_SUB_PROTO_PFIL.
  */
 static void
 vnet_ether_pfil_destroy(__unused void *arg)
 {
 
 	pfil_head_unregister(V_link_pfil_head);
 }
 VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
     vnet_ether_pfil_destroy, NULL);
 
 /*
  * Per-vnet teardown: undo the netisr_register_vnet() performed in
  * vnet_ether_init().  Hooked at SI_SUB_PROTO_IF.
  */
 static void
 vnet_ether_destroy(__unused void *arg)
 {
 
 	netisr_unregister_vnet(&ether_nh);
 }
 VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_destroy, NULL);
 #endif
 
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct mbuf *next;
 	bool needs_epoch;
 
 	/* Enter the net epoch here only if the interface lacks IFF_KNOWSEPOCH. */
 	needs_epoch = (ifp->if_flags & IFF_KNOWSEPOCH) == 0;
 
 	/*
 	 * A driver may hand us several packets chained through m_nextpkt so
 	 * that it can amortize its receive lock.  Unlink each packet and
 	 * dispatch it on its own.
 	 */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_ENTER(et);
 	for (; m != NULL; m = next) {
 		next = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * The deferred context relies on rcvif being correct, so
 		 * check it before the packet leaves our hands.
 		 */
 		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
 		    "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
 		netisr_dispatch(NETISR_ETHER, m);
 	}
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  *
  * Runs the link-layer pfil input hooks (skipped for M_PROMISC frames),
  * hands frames with a non-zero VLAN id to vlan_input_p, drops
  * promiscuously received frames unless IFF_PPROMISC is set, and finally
  * dispatches IPv4/ARP/IPv6 payloads to the matching netisr.  Anything
  * else goes to the netgraph orphan hook if if_l2com is set, or is freed.
  *
  * The mbuf is consumed on every path.  Must be called inside the network
  * epoch (NET_EPOCH_ASSERT).
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL);
 		/* Non-zero result or a NULL mbuf: nothing left to process. */
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	/* Read the header only now: the hooks may have replaced the mbuf. */
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		/* Unknown ethertype: keep the header for the orphan hook. */
 		goto discard;
 	}
 
 	/* Strip off Ethernet header. */
 	m_adj(m, ETHER_HDR_LEN);
 
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	/*
 	 * Shared static buffer: each call overwrites the previous result,
 	 * so callers must consume the string before calling again.
 	 */
 	static char buf[18];
 
 	snprintf(buf, sizeof(buf), "%6D", ap, ":");
 	return (buf);
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	int idx;
 
 	/* Set the Ethernet link parameters, then attach the interface. */
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	ifp->if_mtu = ETHERMTU;
 	if_attach(ifp);
 
 	/* Install the Ethernet method table. */
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 
 	/* Record the MAC address in the interface's link-level sockaddr. */
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	/* Keep a second copy when the interface has a hw-address slot. */
 	if (ifp->if_hw_addr != NULL)
 		bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce the Ethernet MAC address unless it is all zeros. */
 	idx = 0;
 	while (idx < ifp->if_addrlen && lla[idx] == 0)
 		idx++;
 	if (idx < ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 
 	/* All necessary bits are set up; announce the interface now. */
 	EVENTHANDLER_INVOKE(ether_ifattach_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL);
 }
 
 /*
  * Perform common duties while detaching an Ethernet interface
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *lladdr_sdl;
 
 	/* Withdraw the MAC address that ether_ifattach() gave to uuid. */
 	lladdr_sdl = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 	uuid_ether_del(LLADDR(lladdr_sdl));
 
 	/* Detach netgraph first if it is attached to this interface. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
 	/* Detach netgraph in the current vnet if it is attached. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	/* Nothing to re-attach when the ng_ether attach hook is not set. */
 	if (ng_ether_attach_p == NULL)
 		return;
 
 	/* Re-attach with the destination vnet as the current one. */
 	CURVNET_SET_QUIET(new_vnet);
 	(*ng_ether_attach_p)(ifp);
 	CURVNET_RESTORE();
 }
 #endif
 
 /* Sysctl node for Ethernet, created under the _net_link parent. */
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;	/* carry was previously used undeclared */
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 /*
  * Little-endian (reflected) CRC-32 over len bytes of buf.
  *
  * The CRC register starts at 0xffffffff and is returned without a final
  * inversion.  Each input byte is folded in as two 4-bit steps through a
  * 16-entry lookup table.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		/* One table lookup per nibble, low nibble first. */
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 /*
  * Big-endian CRC-32 over len bytes of buf, computed one bit at a time.
  * Input bytes are consumed least-significant bit first; the register
  * starts at 0xffffffff and is returned without a final inversion.
  */
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	uint32_t crc, carry;
 	uint8_t data;
 	size_t pos;
 	int bit;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (pos = 0; pos < len; pos++) {
 		data = buf[pos];
 		for (bit = 0; bit < 8; bit++) {
 			/* Top CRC bit XOR next data bit decides the feedback. */
 			carry = ((crc >> 31) & 1) ^ (data & 0x01);
 			crc <<= 1;
 			if (carry != 0)
 				crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
 			data >>= 1;
 		}
 	}
 
 	return (crc);
 }
 
 /*
  * Common Ethernet ioctl handling.
  *
  * Handles SIOCSIFADDR, SIOCGIFADDR, SIOCSIFMTU, SIOCSLANPCP and
  * SIOCGLANPCP; any other command yields EINVAL.  Returns 0 on success
  * or an errno value.
  */
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *) data;
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	int error;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		/* Mark the interface up and (re)initialize the driver. */
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		return (0);
 
 	case SIOCGIFADDR:
 		/* Copy the link-level address out to the request. */
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ETHER_ADDR_LEN);
 		return (0);
 
 	case SIOCSIFMTU:
 		/* Set the interface MTU; anything above ETHERMTU is refused. */
 		if (ifr->ifr_mtu > ETHERMTU)
 			return (EINVAL);
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 
 	case SIOCSLANPCP:
 		error = priv_check(curthread, PRIV_NET_SETLANPCP);
 		if (error != 0)
 			return (error);
 		/* Valid values are 0..7 or the IFNET_PCP_NONE sentinel. */
 		if (ifr->ifr_lan_pcp > 7 &&
 		    ifr->ifr_lan_pcp != IFNET_PCP_NONE)
 			return (EINVAL);
 		ifp->if_pcp = ifr->ifr_lan_pcp;
 		/* broadcast event about PCP change */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		return (0);
 
 	case SIOCGLANPCP:
 		ifr->ifr_lan_pcp = ifp->if_pcp;
 		return (0);
 
 	default:
 		return (EINVAL);		/* XXX netbsd has ENOTTY??? */
 	}
 }
 
 /*
  * Map a multicast address of family AF_LINK, AF_INET or AF_INET6 to an
  * Ethernet multicast link-level address.
  *
  * On success returns 0 with *llsa set to the filled-in sockaddr_dl, or
  * to NULL when no mapping is needed.  Returns EADDRNOTAVAIL for a
  * non-multicast address and EAFNOSUPPORT for any other family.
  */
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *sdl;
 	u_char *lladdr;
 
 	switch (sa->sa_family) {
 	case AF_LINK:
 		/* Already link-level; only validate that it is multicast. */
 		sdl = (struct sockaddr_dl *)sa;
 		lladdr = LLADDR(sdl);
 		if (!ETHER_IS_MULTICAST(lladdr))
 			return (EADDRNOTAVAIL);
 		*llsa = NULL;
 		return (0);
 
 #ifdef INET
 	case AF_INET: {
 		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
 
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return (EADDRNOTAVAIL);
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		lladdr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, lladdr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
 
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * The unspecified address asks for every Ethernet
 			 * multicast address used for IP6 (this is what
 			 * multicast routers use), so switch the interface
 			 * to all-multicast instead of mapping one address.
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = NULL;
 			return (0);
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return (EADDRNOTAVAIL);
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		lladdr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, lladdr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 	}
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return (EAFNOSUPPORT);
 	}
 }
 
 /* Module descriptor for "ether"; only the name is set. */
 static moduledata_t ether_mod = {
 	.name = "ether",
 };
 
 /*
  * Tap a packet whose VLAN tag is held in the mbuf packet header
  * (M_VLANTAG) to bpf, presenting it as a frame with an in-line 802.1Q
  * header.
  *
  * bp:   bpf interface to tap to.
  * m:    packet; must have M_VLANTAG set and at least an Ethernet header
  *       in its first mbuf.  Its m_len/m_data are adjusted temporarily and
  *       restored before returning.
  * data: optional extra data link header placed in front of the frame,
  *       or NULL.
  * dlen: length of data in bytes.
  */
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	/*
 	 * Build the tagged header on the stack: copy the plain Ethernet
 	 * header, move its ethertype into the inner protocol field, and
 	 * fill in the 802.1Q ethertype and tag.
 	 */
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	/* Skip the original header in place; this is undone below. */
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		/* Only m_next, m_data and m_len of mv and mb are set here. */
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	/* Restore the original mbuf so the caller sees it unchanged. */
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 /*
  * Insert an 802.1Q-style encapsulation header with the given tag and
  * encapsulation ethertype (proto) into an Ethernet frame.
  *
  * Returns the possibly reallocated mbuf chain, or NULL if the prepend or
  * pullup failed.
  */
 struct mbuf *
 ether_vlanencap_proto(struct mbuf *m, uint16_t tag, uint16_t proto)
 {
 	struct ether_vlan_header *hdr;
 
 	/* Open up room in front; M_PREPEND fixes m_len and m_pkthdr.len. */
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 
 	/* The whole tagged header must sit in the first mbuf. */
 	if (m->m_len < sizeof(*hdr) &&
 	    (m = m_pullup(m, sizeof(*hdr))) == NULL)
 		return (NULL);
 
 	/*
 	 * Slide the two MAC addresses to the new front of the frame, then
 	 * write the tag and the encapsulation ethertype into the gap.  The
 	 * original ethertype is already where evl_proto lives.
 	 */
 	hdr = mtod(m, struct ether_vlan_header *);
 	bcopy((char *)hdr + ETHER_VLAN_ENCAP_LEN, (char *)hdr,
 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	hdr->evl_tag = htons(tag);
 	hdr->evl_encap_proto = htons(proto);
 	return (m);
 }
 
 /* Sysctl nodes for VLAN settings, created under the _net_link parent. */
 static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IEEE 802.1Q VLAN");
 static SYSCTL_NODE(_net_link_vlan, PF_LINK, link,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "for consistency");
 
 /*
  * Per-vnet knob: when non-zero, ether_8021q_frame() pads short frames to
  * the minimum Ethernet size before tagging them.
  */
 VNET_DEFINE_STATIC(int, soft_pad);
 #define	V_soft_pad	VNET(soft_pad)
 SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(soft_pad), 0,
     "pad short frames before tagging");
 
 /*
  * For now, make preserving PCP via an mbuf tag optional, as it increases
  * per-packet memory allocations and frees.  In the future, it would be
  * preferable to reuse ether_vtag for this, or similar.
  */
 VNET_DEFINE(int, vlan_mtag_pcp) = 0;
 #define	V_vlan_mtag_pcp	VNET(vlan_mtag_pcp)
 SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(vlan_mtag_pcp), 0,
     "Retain VLAN PCP information as packets are passed up the stack");
 
 /*
  * Apply an 802.1Q tag to an outgoing frame.
  *
  * mp:   in/out packet; set to NULL when the packet has been freed.
  * ife:  interface used for error messages.
  * p:    underlying (parent) interface the frame will be sent on.
  * qtag: VLAN id, PCP and encapsulation ethertype to use; qtag->pcp is
  *       overwritten from the mbuf when M_VLANTAG is already set.
  *
  * The tag is either stored in the packet header for the hardware to
  * insert (IFCAP_VLAN_HWTAGGING and ETHERTYPE_VLAN) or written in-line.
  * Returns true on success; false if padding or encapsulation failed, in
  * which case *mp is NULL.
  */
 bool
 ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p,
     struct ether_8021q_tag *qtag)
 {
 	struct m_tag *mtag;
 	int n;
 	uint16_t tag;
 	static const char pad[8];	/* just zeros */
 
 	/*
 	 * Pad the frame to the minimum size allowed if told to.
 	 * This option is in accord with IEEE Std 802.1Q, 2003 Ed.,
 	 * paragraph C.4.4.3.b.  It can help to work around buggy
 	 * bridges that violate paragraph C.4.4.3.a from the same
 	 * document, i.e., fail to pad short frames after untagging.
 	 * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but
 	 * untagging it will produce a 62-byte frame, which is a runt
 	 * and requires padding.  There are VLAN-enabled network
 	 * devices that just discard such runts instead or mishandle
 	 * them somehow.
 	 */
 	if (V_soft_pad && p->if_type == IFT_ETHER) {
 		/* Append zeros in chunks of at most sizeof(pad) bytes. */
 		for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len;
 		     n > 0; n -= sizeof(pad)) {
 			if (!m_append(*mp, min(n, sizeof(pad)), pad))
 				break;
 		}
 		/* n is still positive only if an m_append() failed. */
 		if (n > 0) {
 			m_freem(*mp);
 			*mp = NULL;
 			if_printf(ife, "cannot pad short frame\n");
 			return (false);
 		}
 	}
 
 	/*
 	 * If PCP is set in mbuf, use it
 	 */
 	if ((*mp)->m_flags & M_VLANTAG) {
 		qtag->pcp = EVL_PRIOFTAG((*mp)->m_pkthdr.ether_vtag);
 	}
 
 	/*
 	 * If underlying interface can do VLAN tag insertion itself,
 	 * just pass the packet along. However, we need some way to
 	 * tell the interface where the packet came from so that it
 	 * knows how to find the VLAN tag to use, so we attach a
 	 * packet tag that holds it.
 	 */
 	if (V_vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q,
 	    MTAG_8021Q_PCP_OUT, NULL)) != NULL)
 		/* The PCP byte is read from just past the m_tag header. */
 		tag = EVL_MAKETAG(qtag->vid, *(uint8_t *)(mtag + 1), 0);
 	else
 		tag = EVL_MAKETAG(qtag->vid, qtag->pcp, 0);
 	if ((p->if_capenable & IFCAP_VLAN_HWTAGGING) &&
 	    (qtag->proto == ETHERTYPE_VLAN)) {
 		(*mp)->m_pkthdr.ether_vtag = tag;
 		(*mp)->m_flags |= M_VLANTAG;
 	} else {
 		*mp = ether_vlanencap_proto(*mp, tag, qtag->proto);
 		if (*mp == NULL) {
 			if_printf(ife, "unable to prepend 802.1Q header\n");
 			return (false);
 		}
 	}
 	return (true);
 }
 
 /*
  * Allocate an address from the FreeBSD Foundation OUI.  This uses a
  * cryptographic hash function on the containing jail's name, UUID and the
  * interface name to attempt to provide a unique but stable address.
  * Pseudo-interfaces which require a MAC address should use this function to
  * allocate non-locally-administered addresses.
  */
 void
 ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr)
 {
 	SHA1_CTX ctx;
 	char *buf;
 	char uuid[HOSTUUIDLEN + 1];
 	uint64_t addr;
 	int i, sz;
 	char digest[SHA1_RESULTLEN];
 	char jailname[MAXHOSTNAMELEN];
 
 	getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid));
 	if (strncmp(uuid, DEFAULT_HOSTUUID, sizeof(uuid)) == 0) {
 		/* Fall back to a random mac address. */
 		goto rando;
 	}
 
 	/* If each (vnet) jail would also have a unique hostuuid this would not
 	 * be necessary. */
 	getjailname(curthread->td_ucred, jailname, sizeof(jailname));
 	sz = asprintf(&buf, M_TEMP, "%s-%s-%s", uuid, if_name(ifp),
 	    jailname);
 	if (sz < 0) {
 		/* Fall back to a random mac address. */
 		goto rando;
 	}
 
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, buf, sz);
 	SHA1Final(digest, &ctx);
 	free(buf, M_TEMP);
 
 	addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) &
 	    OUI_FREEBSD_GENERATED_MASK;
 	addr = OUI_FREEBSD(addr);
 	for (i = 0; i < ETHER_ADDR_LEN; ++i) {
 		hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) &
 		    0xFF;
 	}
 
 	return;
 rando:
 	arc4rand(hwaddr, sizeof(*hwaddr), 0);
 	/* Unicast */
 	hwaddr->octet[0] &= 0xFE;
 	/* Locally administered. */
 	hwaddr->octet[0] |= 0x02;
 }
 
 /* Declare the "ether" module at SI_SUB_INIT_IF, version 1. */
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
diff --git a/sys/net/if_fwsubr.c b/sys/net/if_fwsubr.c
index 321721737d36..5e5fc7a181e5 100644
--- a/sys/net/if_fwsubr.c
+++ b/sys/net/if_fwsubr.c
@@ -1,873 +1,874 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2004 Doug Rabson
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/firewire.h>
 #include <net/if_llatbl.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_FWCOM, "fw_com", "firewire interface internals");
 
 /*
  * FireWire broadcast hardware address: every field is all-ones.
  * firewire_output() copies its first 8 bytes as the destination host
  * for BPF when a packet is not unicast.
  */
 struct fw_hwaddr firewire_broadcastaddr = {
 	0xffffffff,
 	0xffffffff,
 	0xff,
 	0xff,
 	0xffff,
 	0xffffffff
 };
 
 /*
  * Output routine for IP-over-FireWire interfaces.
  *
  * Resolves the destination hardware address for unicast packets (stored
  * in an MTAG_FIREWIRE_HWADDR mbuf tag), lets BPF see the packet, then
  * adds the encapsulation header and passes the packet to if_transmit,
  * splitting it into fragments when it does not fit the negotiated
  * packet size.
  *
  * ifp: outgoing interface.
  * m:   packet to send; consumed on every path.
  * dst: destination address (AF_INET, AF_ARP or AF_INET6).
  * ro:  optional route, used for the address family and gateway flag.
  *
  * Returns 0 on success, or when address resolution has queued the packet
  * (EWOULDBLOCK is mapped to 0); otherwise an errno value.
  */
 static int
 firewire_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct fw_com *fc = IFP2FWC(ifp);
 	int error, type;
 	struct m_tag *mtag;
 	union fw_encap *enc;
 	struct fw_hwaddr *destfw;
 	uint8_t speed;
 	uint16_t psize, fsize, dsize;
 	struct mbuf *mtail;
 	int unicast, dgl, foff;
 	static int next_dgl;
 #if defined(INET) || defined(INET6)
 	int is_gw = 0;
 #endif
 	int af = RO_GET_FAMILY(ro, dst);
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		goto bad;
 #endif
 
 	if (!((ifp->if_flags & IFF_UP) &&
 	   (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
 		error = ENETDOWN;
 		goto bad;
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ro != NULL)
 		is_gw = (ro->ro_flags & RT_HAS_GW) != 0;
 #endif
 	/*
 	 * For unicast, we make a tag to store the lladdr of the
 	 * destination. This might not be the first time we have seen
 	 * the packet (for instance, the arp code might be trying to
 	 * re-send it after receiving an arp reply) so we only
 	 * allocate a tag if there isn't one there already. For
 	 * multicast, we will eventually use a different tag to store
 	 * the channel number.
 	 */
 	unicast = !(m->m_flags & (M_BCAST | M_MCAST));
 	if (unicast) {
 		mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, NULL);
 		if (!mtag) {
 			mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR,
 			    sizeof (struct fw_hwaddr), M_NOWAIT);
 			if (!mtag) {
 				error = ENOMEM;
 				goto bad;
 			}
 			m_tag_prepend(m, mtag);
 		}
 		/* The hardware address lives right after the tag header. */
 		destfw = (struct fw_hwaddr *)(mtag + 1);
 	} else {
 		destfw = NULL;
 	}
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		type = ETHERTYPE_IP;
 		break;
 	case AF_ARP:
 		type = ETHERTYPE_ARP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		type = ETHERTYPE_IPV6;
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", af);
 		error = EAFNOSUPPORT;
 		goto bad;
 	}
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		/*
 		 * Only bother with arp for unicast. Allocation of
 		 * channels etc. for firewire is quite different and
 		 * doesn't fit into the arp model.
 		 */
 		if (unicast) {
 			error = arpresolve(ifp, is_gw, m, dst,
 			    (u_char *) destfw, NULL, NULL);
 			if (error)
 				return (error == EWOULDBLOCK ? 0 : error);
 		}
 		break;
 
 	case AF_ARP:
 	{
 		struct arphdr *ah;
 		ah = mtod(m, struct arphdr *);
 		ah->ar_hrd = htons(ARPHRD_IEEE1394);
 		if (unicast)
 			*destfw = *(struct fw_hwaddr *) ar_tha(ah);
 
 		/*
 		 * The standard arp code leaves a hole for the target
 		 * hardware address which we need to close up.
 		 */
 		bcopy(ar_tpa(ah), ar_tha(ah), ah->ar_pln);
 		m_adj(m, -ah->ar_hln);
 		break;
 	}
 #endif
 
 #ifdef INET6
 	case AF_INET6:
 		if (unicast) {
 			error = nd6_resolve(fc->fc_ifp, LLE_SF(af, is_gw), m,
 			    dst, (u_char *) destfw, NULL, NULL);
 			if (error)
 				return (error == EWOULDBLOCK ? 0 : error);
 		}
 		break;
 #endif
 
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		error = EAFNOSUPPORT;
 		goto bad;
 	}
 
 	/*
 	 * Let BPF tap off a copy before we encapsulate.
 	 */
 	if (bpf_peers_present(ifp->if_bpf)) {
 		struct fw_bpfhdr h;
 		if (unicast)
 			bcopy(destfw, h.firewire_dhost, 8);
 		else
 			bcopy(&firewire_broadcastaddr, h.firewire_dhost, 8);
 		bcopy(&fc->fc_hwaddr, h.firewire_shost, 8);
 		h.firewire_type = htons(type);
 		bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m);
 	}
 
 	/*
 	 * Punt on MCAP for now and send all multicast packets on the
 	 * broadcast channel.
 	 */
 	if (m->m_flags & M_MCAST)
 		m->m_flags |= M_BCAST;
 
 	/*
 	 * Figure out what speed to use and what the largest supported
 	 * packet size is. For unicast, this is the minimum of what we
 	 * can speak and what they can hear. For broadcast, lets be
 	 * conservative and use S100. We could possibly improve that
 	 * by examining the bus manager's speed map or similar. We
 	 * also reduce the packet size for broadcast to account for
 	 * the GASP header.
 	 */
 	if (unicast) {
 		speed = min(fc->fc_speed, destfw->sspd);
 		psize = min(512 << speed, 2 << destfw->sender_max_rec);
 	} else {
 		speed = 0;
 		psize = 512 - 2*sizeof(uint32_t);
 	}
 
 	/*
 	 * Next, we encapsulate, possibly fragmenting the original
 	 * datagram if it won't fit into a single packet.
 	 */
 	if (m->m_pkthdr.len <= psize - sizeof(uint32_t)) {
 		/*
 		 * No fragmentation is necessary.
 		 */
 		M_PREPEND(m, sizeof(uint32_t), M_NOWAIT);
 		if (!m) {
 			error = ENOBUFS;
 			goto bad;
 		}
 		enc = mtod(m, union fw_encap *);
 		enc->unfrag.ether_type = type;
 		enc->unfrag.lf = FW_ENCAP_UNFRAG;
 		enc->unfrag.reserved = 0;
 
 		/*
 		 * Byte swap the encapsulation header manually.
 		 */
 		enc->ul[0] = htonl(enc->ul[0]);
 
 		error = (ifp->if_transmit)(ifp, m);
 		return (error);
 	} else {
 		/*
 		 * Fragment the datagram, making sure to leave enough
 		 * space for the encapsulation header in each packet.
 		 */
 		fsize = psize - 2*sizeof(uint32_t);
 		dgl = next_dgl++;
 		dsize = m->m_pkthdr.len;
 		foff = 0;
 		while (m) {
 			if (m->m_pkthdr.len > fsize) {
 				/*
 				 * Split off the tail segment from the
 				 * datagram, copying our tags over.
 				 */
 				mtail = m_split(m, fsize, M_NOWAIT);
 				if (mtail == NULL) {
 					/*
 					 * The split failed and m is still
 					 * ours; drop what is left of the
 					 * datagram rather than pass NULL
 					 * to m_tag_copy_chain().
 					 */
 					error = ENOBUFS;
 					goto bad;
 				}
 				m_tag_copy_chain(mtail, m, M_NOWAIT);
 			} else {
 				mtail = NULL;
 			}
 
 			/*
 			 * Add our encapsulation header to this
 			 * fragment and hand it off to the link.
 			 */
 			M_PREPEND(m, 2*sizeof(uint32_t), M_NOWAIT);
 			if (!m) {
 				/*
 				 * Free the fragments not yet sent; they
 				 * are no longer reachable through m.
 				 */
 				if (mtail)
 					m_freem(mtail);
 				error = ENOBUFS;
 				goto bad;
 			}
 			enc = mtod(m, union fw_encap *);
 			if (foff == 0) {
 				enc->firstfrag.lf = FW_ENCAP_FIRST;
 				enc->firstfrag.reserved1 = 0;
 				enc->firstfrag.reserved2 = 0;
 				enc->firstfrag.datagram_size = dsize - 1;
 				enc->firstfrag.ether_type = type;
 				enc->firstfrag.dgl = dgl;
 			} else {
 				if (mtail)
 					enc->nextfrag.lf = FW_ENCAP_NEXT;
 				else
 					enc->nextfrag.lf = FW_ENCAP_LAST;
 				enc->nextfrag.reserved1 = 0;
 				enc->nextfrag.reserved2 = 0;
 				enc->nextfrag.reserved3 = 0;
 				enc->nextfrag.datagram_size = dsize - 1;
 				enc->nextfrag.fragment_offset = foff;
 				enc->nextfrag.dgl = dgl;
 			}
 			/* Advance by the payload carried in this fragment. */
 			foff += m->m_pkthdr.len - 2*sizeof(uint32_t);
 
 			/*
 			 * Byte swap the encapsulation header manually.
 			 */
 			enc->ul[0] = htonl(enc->ul[0]);
 			enc->ul[1] = htonl(enc->ul[1]);
 
 			error = (ifp->if_transmit)(ifp, m);
 			if (error) {
 				if (mtail)
 					m_freem(mtail);
 				return (ENOBUFS);
 			}
 
 			m = mtail;
 		}
 
 		return (0);
 	}
 
 bad:
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 static struct mbuf *
 firewire_input_fragment(struct fw_com *fc, struct mbuf *m, int src)
 {
 	union fw_encap *enc;
 	struct fw_reass *r;
 	struct mbuf *mf, *mprev;
 	int dsize;
 	int fstart, fend, start, end, islast;
 	uint32_t id;
 
 	/*
 	 * Find an existing reassembly buffer or create a new one.
 	 */
 	enc = mtod(m, union fw_encap *);
 	id = enc->firstfrag.dgl | (src << 16);
 	STAILQ_FOREACH(r, &fc->fc_frags, fr_link)
 		if (r->fr_id == id)
 			break;
 	if (!r) {
 		r = malloc(sizeof(struct fw_reass), M_TEMP, M_NOWAIT);
 		if (!r) {
 			m_freem(m);
 			return 0;
 		}
 		r->fr_id = id;
 		r->fr_frags = 0;
 		STAILQ_INSERT_HEAD(&fc->fc_frags, r, fr_link);
 	}
 
 	/*
 	 * If this fragment overlaps any other fragment, we must discard
 	 * the partial reassembly and start again.
 	 */
 	if (enc->firstfrag.lf == FW_ENCAP_FIRST)
 		fstart = 0;
 	else
 		fstart = enc->nextfrag.fragment_offset;
 	fend = fstart + m->m_pkthdr.len - 2*sizeof(uint32_t);
 	dsize = enc->nextfrag.datagram_size;
 	islast = (enc->nextfrag.lf == FW_ENCAP_LAST);
 
 	for (mf = r->fr_frags; mf; mf = mf->m_nextpkt) {
 		enc = mtod(mf, union fw_encap *);
 		if (enc->nextfrag.datagram_size != dsize) {
 			/*
 			 * This fragment must be from a different
 			 * packet.
 			 */
 			goto bad;
 		}
 		if (enc->firstfrag.lf == FW_ENCAP_FIRST)
 			start = 0;
 		else
 			start = enc->nextfrag.fragment_offset;
 		end = start + mf->m_pkthdr.len - 2*sizeof(uint32_t);
 		if ((fstart < end && fend > start) ||
 		    (islast && enc->nextfrag.lf == FW_ENCAP_LAST)) {
 			/*
 			 * Overlap - discard reassembly buffer and start
 			 * again with this fragment.
 			 */
 			goto bad;
 		}
 	}
 
 	/*
 	 * Find where to put this fragment in the list.
 	 */
 	for (mf = r->fr_frags, mprev = NULL; mf;
 	    mprev = mf, mf = mf->m_nextpkt) {
 		enc = mtod(mf, union fw_encap *);
 		if (enc->firstfrag.lf == FW_ENCAP_FIRST)
 			start = 0;
 		else
 			start = enc->nextfrag.fragment_offset;
 		if (start >= fend)
 			break;
 	}
 
 	/*
 	 * If this is a last fragment and we are not adding at the end
 	 * of the list, discard the buffer.
 	 */
 	if (islast && mprev && mprev->m_nextpkt)
 		goto bad;
 
 	if (mprev) {
 		m->m_nextpkt = mprev->m_nextpkt;
 		mprev->m_nextpkt = m;
 
 		/*
 		 * Coalesce forwards and see if we can make a whole
 		 * datagram.
 		 */
 		enc = mtod(mprev, union fw_encap *);
 		if (enc->firstfrag.lf == FW_ENCAP_FIRST)
 			start = 0;
 		else
 			start = enc->nextfrag.fragment_offset;
 		end = start + mprev->m_pkthdr.len - 2*sizeof(uint32_t);
 		while (end == fstart) {
 			/*
 			 * Strip off the encap header from m and
 			 * append it to mprev, freeing m.
 			 */
 			m_adj(m, 2*sizeof(uint32_t));
 			mprev->m_nextpkt = m->m_nextpkt;
 			mprev->m_pkthdr.len += m->m_pkthdr.len;
 			m_cat(mprev, m);
 
 			if (mprev->m_pkthdr.len == dsize + 1 + 2*sizeof(uint32_t)) {
 				/*
 				 * We have assembled a complete packet
 				 * we must be finished. Make sure we have
 				 * merged the whole chain.
 				 */
 				STAILQ_REMOVE(&fc->fc_frags, r, fw_reass, fr_link);
 				free(r, M_TEMP);
 				m = mprev->m_nextpkt;
 				while (m) {
 					mf = m->m_nextpkt;
 					m_freem(m);
 					m = mf;
 				}
 				mprev->m_nextpkt = NULL;
 
 				return (mprev);
 			}
 
 			/*
 			 * See if we can continue merging forwards.
 			 */
 			end = fend;
 			m = mprev->m_nextpkt;
 			if (m) {
 				enc = mtod(m, union fw_encap *);
 				if (enc->firstfrag.lf == FW_ENCAP_FIRST)
 					fstart = 0;
 				else
 					fstart = enc->nextfrag.fragment_offset;
 				fend = fstart + m->m_pkthdr.len
 				    - 2*sizeof(uint32_t);
 			} else {
 				break;
 			}
 		}
 	} else {
 		m->m_nextpkt = 0;
 		r->fr_frags = m;
 	}
 
 	return (0);
 
 bad:
 	while (r->fr_frags) {
 		mf = r->fr_frags;
 		r->fr_frags = mf->m_nextpkt;
 		m_freem(mf);
 	}
 	m->m_nextpkt = 0;
 	r->fr_frags = m;
 
 	return (0);
 }
 
 /*
  * Receive path for an IEEE 1394 interface.
  *
  * ifp - receiving interface.
  * m   - received frame, starting at the encapsulation header.
  * src - sender identifier, used to key link-level reassembly.
  *
  * Strips the encapsulation header (reassembling fragments when needed),
  * offers the packet to bpf, and dispatches it to the netisr selected by
  * the ether type.  The mbuf is consumed on every path.
  */
 void
 firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src)
 {
 	struct fw_com *fc = IFP2FWC(ifp);
 	union fw_encap *enc;
 	int type, isr;
 
 	/*
 	 * The caller has already stripped off the packet header
 	 * (stream or wreqb) and marked the mbuf's M_BCAST flag
 	 * appropriately. We de-encapsulate the IP packet and pass it
 	 * up the line after handling link-level fragmentation.
 	 */
 	if (m->m_pkthdr.len < sizeof(uint32_t)) {
 		/*
 		 * Only the diagnostic is emitted here; the m_pullup()
 		 * below is what rejects the short frame.
 		 */
 		if_printf(ifp, "discarding frame without "
 		    "encapsulation header (len %u pkt len %u)\n",
 		    m->m_len, m->m_pkthdr.len);
 	}
 
 	m = m_pullup(m, sizeof(uint32_t));
 	if (m == NULL)
 		return;
 	enc = mtod(m, union fw_encap *);
 
 	/*
 	 * Byte swap the encapsulation header manually.
 	 */
 	enc->ul[0] = ntohl(enc->ul[0]);
 
 	if (enc->unfrag.lf != 0) {
 		/*
 		 * Fragment: the header is two words long.  Swap the
 		 * second word as well and hand the piece to reassembly,
 		 * which returns a packet only once a datagram completes.
 		 */
 		m = m_pullup(m, 2*sizeof(uint32_t));
 		if (!m)
 			return;
 		enc = mtod(m, union fw_encap *);
 		enc->ul[1] = ntohl(enc->ul[1]);
 		m = firewire_input_fragment(fc, m, src);
 		if (!m)
 			return;
 		enc = mtod(m, union fw_encap *);
 		type = enc->firstfrag.ether_type;
 		m_adj(m, 2*sizeof(uint32_t));
 	} else {
 		type = enc->unfrag.ether_type;
 		m_adj(m, sizeof(uint32_t));
 	}
 
 	if (m->m_pkthdr.rcvif == NULL) {
 		if_printf(ifp, "discard frame w/o interface pointer\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if (m->m_pkthdr.rcvif != ifp) {
 		if_printf(ifp, "Warning, frame marked as received on %s\n",
 			m->m_pkthdr.rcvif->if_xname);
 	}
 #endif
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet. The link-level driver
 	 * should have left us a tag with the EUID of the sender.
 	 */
 	if (bpf_peers_present(ifp->if_bpf)) {
 		struct fw_bpfhdr h;
 		struct m_tag *mtag;
 
 		mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 0);
 		/*
 		 * Always fill in the source field: without a tag, fall
 		 * back to the broadcast address so that no uninitialised
 		 * stack bytes are handed to bpf listeners.
 		 */
 		if (mtag)
 			bcopy(mtag + 1, h.firewire_shost, 8);
 		else
 			bcopy(&firewire_broadcastaddr, h.firewire_shost, 8);
 		bcopy(&fc->fc_hwaddr, h.firewire_dhost, 8);
 		h.firewire_type = htons(type);
 		bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m);
 	}
 
 	if (ifp->if_flags & IFF_MONITOR) {
 		/*
 		 * Interface marked for monitoring; discard packet.
 		 */
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Discard packet if interface is not up */
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 
 	switch (type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 	{
 		struct arphdr *ah;
 		ah = mtod(m, struct arphdr *);
 
 		/*
 		 * Adjust the arp packet to insert an empty tha slot.
 		 * NOTE(review): the mbuf is grown in place without a
 		 * visible check for trailing space - confirm the driver
 		 * guarantees room.
 		 */
 		m->m_len += ah->ar_hln;
 		m->m_pkthdr.len += ah->ar_hln;
 		bcopy(ar_tha(ah), ar_tpa(ah), ah->ar_pln);
 		isr = NETISR_ARP;
 		break;
 	}
 #endif
 
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 
 	default:
 		/* Unknown protocol: drop silently. */
 		m_freem(m);
 		return;
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	netisr_dispatch(isr, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Generic ioctl handler for IEEE 1394 interfaces.
  *
  * Handles address assignment, hardware address retrieval and MTU
  * changes.  Returns 0 on success and EINVAL for an MTU above 1500 or
  * for any command it does not recognise.
  */
 int
 firewire_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifaddr *addr = (struct ifaddr *) data;
 	struct ifreq *req = (struct ifreq *) data;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		/* Assigning an address brings the interface up. */
 		ifp->if_flags |= IFF_UP;
 
 		switch (addr->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, addr);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		return (0);
 
 	case SIOCGIFADDR:
 		/* Report the interface's hardware address. */
 		bcopy(&IFP2FWC(ifp)->fc_hwaddr, &req->ifr_addr.sa_data[0],
 		    sizeof(struct fw_hwaddr));
 		return (0);
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU; anything above 1500 is refused.
 		 */
 		if (req->ifr_mtu > 1500)
 			return (EINVAL);
 		ifp->if_mtu = req->ifr_mtu;
 		return (0);
 
 	default:
 		return (EINVAL);		/* XXX netbsd has ENOTTY??? */
 	}
 }
 
 /*
  * Validate a multicast address for this link type.
  *
  * No link-level mapping is ever produced: on success *llsa is set to
  * NULL and 0 is returned.  Returns EADDRNOTAVAIL when the address is not
  * multicast and EAFNOSUPPORT for an unhandled address family.
  */
 static int
 firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
     struct sockaddr *sa)
 {
 #ifdef INET
 	struct sockaddr_in *in4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *in6;
 #endif
 
 	switch (sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed.
 		 */
 		*llsa = NULL;
 		return (0);
 
 #ifdef INET
 	case AF_INET:
 		in4 = (struct sockaddr_in *)sa;
 		if (IN_MULTICAST(ntohl(in4->sin_addr.s_addr))) {
 			*llsa = NULL;
 			return (0);
 		}
 		return (EADDRNOTAVAIL);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		in6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = NULL;
 			return (0);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&in6->sin6_addr)) {
 			*llsa = NULL;
 			return (0);
 		}
 		return (EADDRNOTAVAIL);
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return (EAFNOSUPPORT);
 	}
 }
 
 /*
  * Attach an IEEE 1394 interface to the network stack.
  *
  * ifp - interface to attach.
  * llc - hardware address of the interface; copied into the interface's
  *       link-level sockaddr and used for the attach message.
  */
 void
 firewire_ifattach(struct ifnet *ifp, struct fw_hwaddr *llc)
 {
 	struct fw_com *fc = IFP2FWC(ifp);
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	const char *speedname;
 	static const char* speeds[] = {
 		"S100", "S200", "S400", "S800",
 		"S1600", "S3200"
 	};
 
 	fc->fc_speed = llc->sspd;
 	STAILQ_INIT(&fc->fc_frags);
 
 	ifp->if_addrlen = sizeof(struct fw_hwaddr);
 	ifp->if_hdrlen = 0;
 	if_attach(ifp);
 	ifp->if_mtu = 1500;	/* XXX */
 	ifp->if_output = firewire_output;
 	ifp->if_resolvemulti = firewire_resolvemulti;
 	ifp->if_broadcastaddr = (u_char *) &firewire_broadcastaddr;
 
 	/* Fill in the link-level address of the interface. */
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_IEEE1394;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(llc, LLADDR(sdl), ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_APPLE_IP_OVER_IEEE1394,
 	    sizeof(struct fw_hwaddr));
 
 	/*
 	 * The speed code comes straight from the supplied address; do not
 	 * index past the end of the name table if it is out of range.
 	 */
 	if (llc->sspd < sizeof(speeds) / sizeof(speeds[0]))
 		speedname = speeds[llc->sspd];
 	else
 		speedname = "unknown";
 
 	if_printf(ifp, "Firewire address: %8D @ 0x%04x%08x, %s, maxrec %d\n",
 	    (uint8_t *) &llc->sender_unique_ID_hi, ":",
 	    ntohs(llc->sender_unicast_FIFO_hi),
 	    ntohl(llc->sender_unicast_FIFO_lo),
 	    speedname,
 	    (2 << llc->sender_max_rec));
 }
 
 /*
  * Detach an IEEE 1394 interface: drop its bpf attachment first, then
  * detach it from the network stack.
  */
 void
 firewire_ifdetach(struct ifnet *ifp)
 {
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 void
 firewire_busreset(struct ifnet *ifp)
 {
 	struct fw_com *fc = IFP2FWC(ifp);
 	struct fw_reass *r;
 	struct mbuf *m;
 
 	/*
 	 * Discard any partial datagrams since the host ids may have changed.
 	 */
 	while ((r = STAILQ_FIRST(&fc->fc_frags))) {
 		STAILQ_REMOVE_HEAD(&fc->fc_frags, fr_link);
 		while (r->fr_frags) {
 			m = r->fr_frags;
 			r->fr_frags = m->m_nextpkt;
 			m_freem(m);
 		}
 		free(r, M_TEMP);
 	}
 }
 
 /*
  * Allocate the zeroed per-interface fw_com structure and record the
  * owning ifnet in it.  The type argument is not used.
  */
 static void *
 firewire_alloc(u_char type, struct ifnet *ifp)
 {
 	struct fw_com *com;
 
 	com = malloc(sizeof(*com), M_FWCOM, M_WAITOK | M_ZERO);
 	com->fc_ifp = ifp;
 	return (com);
 }
 
 /*
  * Release a structure obtained from firewire_alloc().  The type
  * argument is not used.
  */
 static void
 firewire_free(void *com, u_char type)
 {
 
 	free(com, M_FWCOM);
 }
 
 /*
  * Module event handler: registers the fw_com allocator pair for
  * IFT_IEEE1394 on load and removes it on unload.  Any other event
  * yields EOPNOTSUPP.
  */
 static int
 firewire_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD) {
 		if_register_com_alloc(IFT_IEEE1394,
 		    firewire_alloc, firewire_free);
 		return (0);
 	}
 	if (type == MOD_UNLOAD) {
 		if_deregister_com_alloc(IFT_IEEE1394);
 		return (0);
 	}
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, no private data. */
 static moduledata_t firewire_mod = {
 	"if_firewire",
 	firewire_modevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(if_firewire, firewire_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(if_firewire, 1);
diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c
index ea4b80690e50..18160c21411e 100644
--- a/sys/net/if_gif.c
+++ b/sys/net/if_gif.c
@@ -1,723 +1,724 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/conf.h>
 #include <machine/cpu.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #ifdef	INET
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #endif	/* INET */
 
 #ifdef INET6
 #ifndef INET
 #include <netinet/in.h>
 #endif
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_ecn.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 #include <netinet/ip_encap.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_gif.h>
 
 #include <security/mac/mac_framework.h>
 
 static const char gifname[] = "gif";
 
 MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface");
 static struct sx gif_ioctl_sx;
 SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl");
 
 void	(*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af);
 void	(*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af);
 void	(*ng_gif_attach_p)(struct ifnet *ifp);
 void	(*ng_gif_detach_p)(struct ifnet *ifp);
 
 #ifdef VIMAGE
 static void	gif_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static void	gif_delete_tunnel(struct gif_softc *);
 static int	gif_ioctl(struct ifnet *, u_long, caddr_t);
 static int	gif_transmit(struct ifnet *, struct mbuf *);
 static void	gif_qflush(struct ifnet *);
 static int	gif_clone_create(struct if_clone *, int, caddr_t);
 static void	gif_clone_destroy(struct ifnet *);
 VNET_DEFINE_STATIC(struct if_clone *, gif_cloner);
 #define	V_gif_cloner	VNET(gif_cloner)
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Generic Tunnel Interface");
 #ifndef MAX_GIF_NEST
 /*
  * This macro sets the default upper limit on the nesting of gif tunnels.
  * Because a large value combined with a careless configuration may crash
  * the system, nesting is not allowed by default.
  * If you need nested gif tunnels, you can define this macro in your
  * kernel configuration file.  If you do so, take care to configure the
  * tunnels so that they do not form a loop.
  */
 #define MAX_GIF_NEST 1
 #endif
 VNET_DEFINE_STATIC(int, max_gif_nesting) = MAX_GIF_NEST;
 #define	V_max_gif_nesting	VNET(max_gif_nesting)
 SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels");
 
 /*
  * Cloner callback: create gif interface number `unit'.
  *
  * Allocates the softc and its ifnet, installs the gif method table,
  * attaches the interface to the stack and to bpf, and notifies the
  * netgraph attach hook when one is registered.  Always returns 0.
  */
 static int
 gif_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct gif_softc *sc;
 	struct ifnet *ifp;
 
 	sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
 	/* New tunnels start in the FIB of the creating process. */
 	sc->gif_fibnum = curthread->td_proc->p_fibnum;
 
 	ifp = if_alloc(IFT_GIF);
 	GIF2IFP(sc) = ifp;
 	ifp->if_softc = sc;
 	if_initname(ifp, gifname, unit);
 
 	/* Interface parameters and methods. */
 	ifp->if_addrlen = 0;
 	ifp->if_mtu = GIF_MTU;
 	ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
 	ifp->if_ioctl = gif_ioctl;
 	ifp->if_transmit = gif_transmit;
 	ifp->if_qflush = gif_qflush;
 	ifp->if_output = gif_output;
 #ifdef VIMAGE
 	ifp->if_reassign = gif_reassign;
 #endif
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	if (ng_gif_attach_p != NULL)
 		(*ng_gif_attach_p)(ifp);
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_reassign method: tear down the tunnel configuration of an
  * interface, under the gif ioctl lock.  The remaining arguments are
  * unused.
  */
 static void
 gif_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct gif_softc *softc;
 
 	sx_xlock(&gif_ioctl_sx);
 	softc = ifp->if_softc;
 	/* The softc pointer is cleared once the interface is destroyed. */
 	if (softc != NULL) {
 		gif_delete_tunnel(softc);
 	}
 	sx_xunlock(&gif_ioctl_sx);
 }
 #endif /* VIMAGE */
 
 /*
  * Cloner callback: destroy a gif interface.
  *
  * Under the ioctl lock the tunnel is deleted, the netgraph hook is
  * notified, and the interface is detached from bpf and the stack.  The
  * ifnet and softc are freed only after GIF_WAIT() has returned.
  */
 static void
 gif_clone_destroy(struct ifnet *ifp)
 {
 	struct gif_softc *softc;
 
 	sx_xlock(&gif_ioctl_sx);
 	softc = ifp->if_softc;
 	gif_delete_tunnel(softc);
 	if (ng_gif_detach_p != NULL) {
 		(*ng_gif_detach_p)(ifp);
 	}
 	bpfdetach(ifp);
 	if_detach(ifp);
 	/* From here on ioctl and reassign callers see no softc. */
 	ifp->if_softc = NULL;
 	sx_xunlock(&gif_ioctl_sx);
 
 	GIF_WAIT();
 	if_free(ifp);
 	free(softc, M_GIF);
 }
 
 /*
  * Per-vnet initialisation: register the "gif" interface cloner, then run
  * the initialisers of the address-family specific parts that are
  * compiled in.
  */
 static void
 vnet_gif_init(const void *unused __unused)
 {
 
 	V_gif_cloner = if_clone_simple(gifname, gif_clone_create,
 	    gif_clone_destroy, 0);
 #ifdef INET
 	in_gif_init();
 #endif
 #ifdef INET6
 	in6_gif_init();
 #endif
 }
 VNET_SYSINIT(vnet_gif_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_gif_init, NULL);
 
 /*
  * Per-vnet teardown: detach the cloner, then run the uninitialisers of
  * the address-family specific parts that are compiled in.
  */
 static void
 vnet_gif_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_gif_cloner);
 #ifdef INET
 	in_gif_uninit();
 #endif
 #ifdef INET6
 	in6_gif_uninit();
 #endif
 }
 VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_gif_uninit, NULL);
 
 /*
  * Module event handler.  Load and unload need no work in this function
  * and succeed; every other event is rejected with EOPNOTSUPP.
  */
 static int
 gifmodevent(module_t mod, int type, void *data)
 {
 
 	if (type != MOD_LOAD && type != MOD_UNLOAD)
 		return (EOPNOTSUPP);
 	return (0);
 }
 
 /* Module description: name, event handler, no private data. */
 static moduledata_t gif_mod = {
 	"if_gif",
 	gifmodevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_gif, 1);
 
 /*
  * Allocate a hash table of GIF_HASH_SIZE list heads and initialise each
  * head as an empty list.  Release it with gif_hashdestroy().
  */
 struct gif_list *
 gif_hashinit(void)
 {
 	struct gif_list *table;
 	int bucket;
 
 	table = malloc(sizeof(struct gif_list) * GIF_HASH_SIZE,
 	    M_GIF, M_WAITOK);
 	bucket = 0;
 	while (bucket < GIF_HASH_SIZE) {
 		CK_LIST_INIT(&table[bucket]);
 		bucket++;
 	}
 
 	return (table);
 }
 
 /*
  * Free a hash table allocated by gif_hashinit().  Only the array of list
  * heads is released; entries still linked on it are not touched.
  */
 void
 gif_hashdestroy(struct gif_list *hash)
 {
 
 	free(hash, M_GIF);
 }
 
 #define	MTAG_GIF	1080679712
 /*
  * if_transmit method: encapsulate one packet and send it over the tunnel.
  *
  * The inner address family comes from m_pkthdr.csum_data, where
  * gif_output() stashed it, or is forced to AF_LINK when the interface is
  * a bridge member.  The outer family is taken from the softc.
  *
  * Returns 0 or an errno value; a non-zero result also bumps the
  * interface's output error counter.  Every explicit error path below
  * frees the mbuf before leaving.
  */
 static int
 gif_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct gif_softc *sc;
 	struct etherip_header *eth;
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	uint32_t t;
 #endif
 	uint32_t af;
 	uint8_t proto, ecn;
 	int error;
 
 	NET_EPOCH_ASSERT();
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error) {
 		m_freem(m);
 		goto err;
 	}
 #endif
 	/*
 	 * ENETDOWN is the result for every state check below; only the
 	 * nesting check, evaluated last, replaces it with its own value.
 	 */
 	error = ENETDOWN;
 	sc = ifp->if_softc;
 	if ((ifp->if_flags & IFF_MONITOR) != 0 ||
 	    (ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    sc->gif_family == 0 ||
 	    (error = if_tunnel_check_nesting(ifp, m, MTAG_GIF,
 		V_max_gif_nesting)) != 0) {
 		m_freem(m);
 		goto err;
 	}
 	/* Now pull back the af that we stashed in the csum_data. */
 	if (ifp->if_bridge)
 		af = AF_LINK;
 	else
 		af = m->m_pkthdr.csum_data;
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	M_SETFIB(m, sc->gif_fibnum);
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	/* Counted before encapsulation, so OBYTES is the inner length. */
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	/* inner AF-specific encapsulation */
 	ecn = 0;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		proto = IPPROTO_IPV4;
 		if (m->m_len < sizeof(struct ip))
 			m = m_pullup(m, sizeof(struct ip));
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		ip = mtod(m, struct ip *);
 		ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &ecn, &ip->ip_tos);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		proto = IPPROTO_IPV6;
 		if (m->m_len < sizeof(struct ip6_hdr))
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		t = 0;
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &t, &ip6->ip6_flow);
 		/* Take the 8 bits at bit 20 of the host-order flow word. */
 		ecn = (ntohl(t) >> 20) & 0xff;
 		break;
 #endif
 	case AF_LINK:
 		/* Ethernet payload: prepend the EtherIP header. */
 		proto = IPPROTO_ETHERIP;
 		M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT);
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		eth = mtod(m, struct etherip_header *);
 		eth->eip_resvh = 0;
 		eth->eip_ver = ETHERIP_VERSION;
 		eth->eip_resvl = 0;
 		break;
 	default:
 		error = EAFNOSUPPORT;
 		m_freem(m);
 		goto err;
 	}
 	/* XXX should we check if our outer source is legal? */
 	/* dispatch to output logic based on outer AF */
 	switch (sc->gif_family) {
 #ifdef INET
 	case AF_INET:
 		error = in_gif_output(ifp, m, proto, ecn);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = in6_gif_output(ifp, m, proto, ecn);
 		break;
 #endif
 	default:
 		/*
 		 * NOTE(review): error is still 0 here, so a packet dropped
 		 * for an unhandled outer family is reported as success -
 		 * confirm this is intended.
 		 */
 		m_freem(m);
 	}
 err:
 	if (error)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (error);
 }
 
 /*
  * if_qflush method.  Intentionally empty: this function has nothing to
  * flush.
  */
 static void
 gif_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 /*
  * if_output method: work out the address family of the packet, stash it
  * in the packet header and pass the packet to the interface's transmit
  * routine, whose result is returned.
  */
 int
 gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	uint32_t family;
 
 	if (dst->sa_family != AF_UNSPEC) {
 		family = RO_GET_FAMILY(ro, dst);
 	} else {
 		/* For AF_UNSPEC the real family is carried in sa_data. */
 		bcopy(dst->sa_data, &family, sizeof(family));
 	}
 
 	/*
 	 * Now save the af in the inbound pkt csum data, this is a cheat since
 	 * we are using the inbound csum_data field to carry the af over to
 	 * the gif_transmit() routine, avoiding using yet another mtag.
 	 */
 	m->m_pkthdr.csum_data = family;
 
 	return (ifp->if_transmit(ifp, m));
 }
 
 /*
  * Input path for a decapsulated tunnel packet.
  *
  * m     - the inner packet.
  * ifp   - the gif interface it arrived on; a NULL ifp drops the packet.
  * proto - outer protocol number, which selects the inner address family.
  * ecn   - ECN value from the outer header, folded into the inner header.
  *
  * IPv4 and IPv6 packets are queued to the matching netisr.  EtherIP
  * frames are handed to the bridge when the interface is a bridge member
  * and are freed otherwise.  Paths that reach the drop label count an
  * input error.
  */
 void
 gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
 {
 	struct etherip_header *eip;
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	uint32_t t;
 #endif
 	struct ether_header *eh;
 	struct ifnet *oldifp;
 	int isr, n, af;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp == NULL) {
 		/* just in case */
 		m_freem(m);
 		return;
 	}
 	m->m_pkthdr.rcvif = ifp;
 	m_clrprotoflags(m);
 	switch (proto) {
 #ifdef INET
 	case IPPROTO_IPV4:
 		af = AF_INET;
 		if (m->m_len < sizeof(struct ip))
 			m = m_pullup(m, sizeof(struct ip));
 		if (m == NULL)
 			goto drop;
 		ip = mtod(m, struct ip *);
 		/* A zero return from the ECN merge means drop the packet. */
 		if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &ecn, &ip->ip_tos) == 0) {
 			m_freem(m);
 			goto drop;
 		}
 		break;
 #endif
 #ifdef INET6
 	case IPPROTO_IPV6:
 		af = AF_INET6;
 		if (m->m_len < sizeof(struct ip6_hdr))
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL)
 			goto drop;
 		/* Shift the ECN byte to bit 20, in network byte order. */
 		t = htonl((uint32_t)ecn << 20);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &t, &ip6->ip6_flow) == 0) {
 			m_freem(m);
 			goto drop;
 		}
 		break;
 #endif
 	case IPPROTO_ETHERIP:
 		af = AF_LINK;
 		break;
 	default:
 		m_freem(m);
 		goto drop;
 	}
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	if (bpf_peers_present(ifp->if_bpf)) {
 		uint32_t af1 = af;
 		bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
 	}
 
 	/* In monitor mode the packet is counted and then discarded. */
 	if ((ifp->if_flags & IFF_MONITOR) != 0) {
 		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 		m_freem(m);
 		return;
 	}
 
 	/* The netgraph hook may take the packet, leaving m NULL. */
 	if (ng_gif_input_p != NULL) {
 		(*ng_gif_input_p)(ifp, &m, af);
 		if (m == NULL)
 			goto drop;
 	}
 
 	/*
 	 * Put the packet to the network layer input queue according to the
 	 * specified address family.
 	 * Note: older versions of gif_input directly called network layer
 	 * input functions, e.g. ip6_input, here.  We changed the policy to
 	 * prevent too many recursive calls of such input functions, which
 	 * might cause kernel panic.  But the change may introduce another
 	 * problem; if the input queue is full, packets are discarded.
 	 * The kernel stack overflow really happened, and we believed
 	 * queue-full rarely occurs, so we changed the policy.
 	 */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	case AF_LINK:
 		/* Need the EtherIP header plus a full Ethernet header. */
 		n = sizeof(struct etherip_header) +
 		    sizeof(struct ether_header);
 		if (n > m->m_len)
 			m = m_pullup(m, n);
 		if (m == NULL)
 			goto drop;
 		eip = mtod(m, struct etherip_header *);
 		if (eip->eip_ver != ETHERIP_VERSION) {
 			/* discard unknown versions */
 			m_freem(m);
 			goto drop;
 		}
 
 		m_adj_decap(m, sizeof(struct etherip_header));
 
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 		m->m_pkthdr.rcvif = ifp;
 
 		if (ifp->if_bridge) {
 			/* Remembered to detect a change of ifp below. */
 			oldifp = ifp;
 			eh = mtod(m, struct ether_header *);
 			if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 				if (ETHER_IS_BROADCAST(eh->ether_dhost))
 					m->m_flags |= M_BCAST;
 				else
 					m->m_flags |= M_MCAST;
 				if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 			}
 			BRIDGE_INPUT(ifp, m);
 
 			if (m != NULL && ifp != oldifp) {
 				/*
 				 * The bridge gave us back itself or one of the
 				 * members for which the frame is addressed.
 				 */
 				ether_demux(ifp, m);
 				return;
 			}
 		}
 		/* Not bridged, or the bridge left the frame with us. */
 		if (m != NULL)
 			m_freem(m);
 		return;
 
 	default:
 		if (ng_gif_input_orphan_p != NULL)
 			(*ng_gif_input_orphan_p)(ifp, m, af);
 		else
 			m_freem(m);
 		return;
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	return;
 drop:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 }
 
 /*
  * if_ioctl method for gif interfaces.
  *
  * Simple requests (address, multicast, flags, MTU) are answered in the
  * first switch without taking a lock.  Everything else runs under
  * gif_ioctl_sx and fails with ENXIO when the interface no longer has a
  * softc.  Returns 0 or an errno value.
  */
 static int
 gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq*)data;
 	struct gif_softc *sc;
 	u_int options;
 	int error;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		/* FALLTHROUGH */
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCGIFMTU:
 	case SIOCSIFFLAGS:
 		return (0);
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < GIF_MTU_MIN ||
 		    ifr->ifr_mtu > GIF_MTU_MAX)
 			return (EINVAL);
 		else
 			ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	}
 	sx_xlock(&gif_ioctl_sx);
 	sc = ifp->if_softc;
 	if (sc == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	error = 0;
 	switch (cmd) {
 	case SIOCDIFPHYADDR:
 		/* Nothing to delete when no tunnel is configured. */
 		if (sc->gif_family == 0)
 			break;
 		gif_delete_tunnel(sc);
 		break;
 #ifdef INET
 	case SIOCSIFPHYADDR:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 		error = in_gif_ioctl(sc, cmd, data);
 		break;
 #endif
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 		error = in6_gif_ioctl(sc, cmd, data);
 		break;
 #endif
 	case SIOCGTUNFIB:
 		ifr->ifr_fib = sc->gif_fibnum;
 		break;
 	case SIOCSTUNFIB:
 		if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
 			break;
 		if (ifr->ifr_fib >= rt_numfibs)
 			error = EINVAL;
 		else
 			sc->gif_fibnum = ifr->ifr_fib;
 		break;
 	case GIFGOPTS:
 		options = sc->gif_options;
 		error = copyout(&options, ifr_data_get_ptr(ifr),
 		    sizeof(options));
 		break;
 	case GIFSOPTS:
 		if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
 			break;
 		error = copyin(ifr_data_get_ptr(ifr), &options,
 		    sizeof(options));
 		if (error)
 			break;
 		/* Reject any bit outside the supported option mask. */
 		if (options & ~GIF_OPTMASK) {
 			error = EINVAL;
 			break;
 		}
 		if (sc->gif_options != options) {
 			switch (sc->gif_family) {
 #ifdef INET
 			case AF_INET:
 				error = in_gif_setopts(sc, options);
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				error = in6_gif_setopts(sc, options);
 				break;
 #endif
 			default:
 				/* No need to invoke AF-handler */
 				sc->gif_options = options;
 			}
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/*
 	 * After a tunnel address was set successfully, report the link as
 	 * up.  The trailing "0" keeps the condition well-formed when
 	 * neither address family is compiled in.
 	 */
 	if (error == 0 && sc->gif_family != 0) {
 		if (
 #ifdef INET
 		    cmd == SIOCSIFPHYADDR ||
 #endif
 #ifdef INET6
 		    cmd == SIOCSIFPHYADDR_IN6 ||
 #endif
 		    0) {
 			if_link_state_change(ifp, LINK_STATE_UP);
 		}
 	}
 bad:
 	sx_xunlock(&gif_ioctl_sx);
 	return (error);
 }
 
 static void
 gif_delete_tunnel(struct gif_softc *sc)
 {
 
 	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
 	if (sc->gif_family != 0) {
 		CK_LIST_REMOVE(sc, srchash);
 		CK_LIST_REMOVE(sc, chain);
 		/* Wait until it become safe to free gif_hdr */
 		GIF_WAIT();
 		free(sc->gif_hdr, M_GIF);
 	}
 	sc->gif_family = 0;
 	GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 	if_link_state_change(GIF2IFP(sc), LINK_STATE_DOWN);
 }
diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c
index 5ad452ac38e0..2349a359e12e 100644
--- a/sys/net/if_gre.c
+++ b/sys/net/if_gre.c
@@ -1,832 +1,833 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1998 The NetBSD Foundation, Inc.
  * Copyright (c) 2014, 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Heiko W.Rupp <hwr@pilhuhn.de>
  *
  * IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #ifdef INET
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef RSS
 #include <netinet/in_rss.h>
 #endif
 #endif
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #ifdef RSS
 #include <netinet6/in6_rss.h>
 #endif
 #endif
 
 #include <netinet/ip_encap.h>
 #include <netinet/udp.h>
 #include <net/bpf.h>
 #include <net/if_gre.h>
 
 #include <machine/in_cksum.h>
 #include <security/mac/mac_framework.h>
 
 /* MTU assigned to a newly created gre interface (see gre_clone_create()). */
 #define	GREMTU			1476
 
 /* Interface name prefix; also used as the malloc type's short name. */
 static const char grename[] = "gre";
 MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation");
 
 /*
  * Exclusive lock taken around ioctl handling, clone destruction and
  * vnet reassignment in this file.
  */
 static struct sx gre_ioctl_sx;
 SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl");
 
 static int	gre_clone_create(struct if_clone *, int, caddr_t);
 static void	gre_clone_destroy(struct ifnet *);
 /* Per-vnet cloner handle, set in vnet_gre_init(). */
 VNET_DEFINE_STATIC(struct if_clone *, gre_cloner);
 #define	V_gre_cloner	VNET(gre_cloner)
 
 #ifdef VIMAGE
 static void	gre_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static void	gre_qflush(struct ifnet *);
 static int	gre_transmit(struct ifnet *, struct mbuf *);
 static int	gre_ioctl(struct ifnet *, u_long, caddr_t);
 static int	gre_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	gre_delete_tunnel(struct gre_softc *);
 
 /* net.link.gre sysctl subtree. */
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Generic Routing Encapsulation");
 #ifndef MAX_GRE_NEST
 /*
  * This macro sets the default upper limit on the nesting of gre tunnels.
  * Since setting this macro to a large value with a careless configuration
  * may crash the system, we don't allow any nesting by default.
  * If you need to configure nested gre tunnels, you can define this macro
  * in your kernel configuration file.  However, if you do so, please be
  * careful to configure the tunnels so that they won't form a loop.
  */
 #define MAX_GRE_NEST 1
 #endif
 
 VNET_DEFINE_STATIC(int, max_gre_nesting) = MAX_GRE_NEST;
 #define	V_max_gre_nesting	VNET(max_gre_nesting)
 SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels");
 
 /*
  * Per-vnet initialization: register the "gre" interface cloner and run
  * the address-family specific init routines that are compiled in.
  */
 static void
 vnet_gre_init(const void *unused __unused)
 {
 
 	V_gre_cloner = if_clone_simple(grename, gre_clone_create,
 	    gre_clone_destroy, 0);
 #ifdef INET
 	in_gre_init();
 #endif
 #ifdef INET6
 	in6_gre_init();
 #endif
 }
 VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_gre_init, NULL);
 
 /*
  * Per-vnet teardown: detach the cloner first, then run the
  * address-family specific uninit routines that are compiled in.
  */
 static void
 vnet_gre_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_gre_cloner);
 #ifdef INET
 	in_gre_uninit();
 #endif
 #ifdef INET6
 	in6_gre_uninit();
 #endif
 	/* XXX: epoch_call drain */
 }
 VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_gre_uninit, NULL);
 
 /*
  * Cloner "create" callback: allocate a zeroed softc and its ifnet, fill
  * in the interface methods and defaults, then attach the interface and
  * its bpf tap.  Always returns 0.
  */
 static int
 gre_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct gre_softc *sc;
 	struct ifnet *ifp;
 
 	sc = malloc(sizeof(*sc), M_GRE, M_WAITOK | M_ZERO);
 	/* New tunnels start in the FIB of the creating process. */
 	sc->gre_fibnum = curthread->td_proc->p_fibnum;
 
 	ifp = if_alloc(IFT_TUNNEL);
 	GRE2IFP(sc) = ifp;
 	ifp->if_softc = sc;
 	if_initname(ifp, grename, unit);
 
 	/* Defaults and interface methods. */
 	ifp->if_mtu = GREMTU;
 	ifp->if_flags = IFF_POINTOPOINT|IFF_MULTICAST;
 	ifp->if_output = gre_output;
 	ifp->if_ioctl = gre_ioctl;
 	ifp->if_transmit = gre_transmit;
 	ifp->if_qflush = gre_qflush;
 #ifdef VIMAGE
 	ifp->if_reassign = gre_reassign;
 #endif
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_reassign callback: drop the tunnel configuration (if the softc is
  * still present) while holding the ioctl lock exclusively.
  */
 static void
 gre_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct gre_softc *softc;
 
 	sx_xlock(&gre_ioctl_sx);
 	softc = ifp->if_softc;
 	if (softc == NULL) {
 		sx_xunlock(&gre_ioctl_sx);
 		return;
 	}
 	gre_delete_tunnel(softc);
 	sx_xunlock(&gre_ioctl_sx);
 }
 #endif /* VIMAGE */
 
 /*
  * Cloner "destroy" callback.  Under the ioctl lock the tunnel is deleted,
  * the bpf tap and the interface are detached and if_softc is cleared (the
  * ioctl and reassign paths check it for NULL).  After dropping the lock
  * and GRE_WAIT(), the ifnet and the softc are freed.
  */
 static void
 gre_clone_destroy(struct ifnet *ifp)
 {
 	struct gre_softc *sc;
 
 	sx_xlock(&gre_ioctl_sx);
 	sc = ifp->if_softc;
 	gre_delete_tunnel(sc);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	ifp->if_softc = NULL;
 	sx_xunlock(&gre_ioctl_sx);
 
 	/* NOTE(review): presumably waits out readers still using sc; confirm. */
 	GRE_WAIT();
 	if_free(ifp);
 	free(sc, M_GRE);
 }
 
 /*
  * Interface ioctl handler.
  *
  * The first switch handles requests that are answered without taking
  * gre_ioctl_sx and returns directly.  Everything else runs with the lock
  * held exclusively and leaves through the "end" label.
  *
  * Returns 0 on success or an errno value: EINVAL for bad arguments or an
  * unknown command, EOPNOTSUPP for the GRE[SG]ADDR*/GRE[SG]PROTO requests,
  * ENXIO when the softc is already gone, or whatever priv_check(),
  * copyin()/copyout() or the address-family handlers return.
  */
 static int
 gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct gre_softc *sc;
 	uint32_t opt;
 	int error;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		 /* XXX: */
 		if (ifr->ifr_mtu < 576)
 			return (EINVAL);
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		/* FALLTHROUGH: nothing else to do, report success below. */
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		return (0);
 	case GRESADDRS:
 	case GRESADDRD:
 	case GREGADDRS:
 	case GREGADDRD:
 	case GRESPROTO:
 	case GREGPROTO:
 		return (EOPNOTSUPP);
 	}
 	sx_xlock(&gre_ioctl_sx);
 	sc = ifp->if_softc;
 	if (sc == NULL) {
 		/* if_softc is cleared by gre_clone_destroy(). */
 		error = ENXIO;
 		goto end;
 	}
 	error = 0;
 	switch (cmd) {
 	case SIOCDIFPHYADDR:
 		/* Nothing to delete if no tunnel is configured. */
 		if (sc->gre_family == 0)
 			break;
 		gre_delete_tunnel(sc);
 		break;
 #ifdef INET
 	case SIOCSIFPHYADDR:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 		error = in_gre_ioctl(sc, cmd, data);
 		break;
 #endif
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 		error = in6_gre_ioctl(sc, cmd, data);
 		break;
 #endif
 	case SIOCGTUNFIB:
 		ifr->ifr_fib = sc->gre_fibnum;
 		break;
 	case SIOCSTUNFIB:
 		if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0)
 			break;
 		if (ifr->ifr_fib >= rt_numfibs)
 			error = EINVAL;
 		else
 			sc->gre_fibnum = ifr->ifr_fib;
 		break;
 	case GRESKEY:
 	case GRESOPTS:
 	case GRESPORT:
 		/*
 		 * All three setters take a 32-bit value from userland.
 		 * Each branch below validates it and breaks out early when
 		 * the value is unchanged, so the family handler is only
 		 * invoked for a real change.
 		 */
 		if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0)
 			break;
 		if ((error = copyin(ifr_data_get_ptr(ifr), &opt,
 		    sizeof(opt))) != 0)
 			break;
 		if (cmd == GRESKEY) {
 			if (sc->gre_key == opt)
 				break;
 		} else if (cmd == GRESOPTS) {
 			if (opt & ~GRE_OPTMASK) {
 				error = EINVAL;
 				break;
 			}
 			if (sc->gre_options == opt)
 				break;
 		} else if (cmd == GRESPORT) {
 			/* 0 is accepted; otherwise the port must be in range. */
 			if (opt != 0 && (opt < V_ipport_hifirstauto ||
 			    opt > V_ipport_hilastauto)) {
 				error = EINVAL;
 				break;
 			}
 			if (sc->gre_port == opt)
 				break;
 			if ((sc->gre_options & GRE_UDPENCAP) == 0) {
 				/*
 				 * UDP encapsulation is not enabled, thus
 				 * there is no need to reattach softc.
 				 */
 				sc->gre_port = opt;
 				break;
 			}
 		}
 		switch (sc->gre_family) {
 #ifdef INET
 		case AF_INET:
 			error = in_gre_setopts(sc, cmd, opt);
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			error = in6_gre_setopts(sc, cmd, opt);
 			break;
 #endif
 		default:
 			/*
 			 * Tunnel is not yet configured.
 			 * We can just change any parameters.
 			 */
 			if (cmd == GRESKEY)
 				sc->gre_key = opt;
 			if (cmd == GRESOPTS)
 				sc->gre_options = opt;
 			if (cmd == GRESPORT)
 				sc->gre_port = opt;
 			break;
 		}
 		/*
 		 * XXX: Do we need to initiate change of interface
 		 * state here?
 		 */
 		break;
 	case GREGKEY:
 		error = copyout(&sc->gre_key, ifr_data_get_ptr(ifr),
 		    sizeof(sc->gre_key));
 		break;
 	case GREGOPTS:
 		error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr),
 		    sizeof(sc->gre_options));
 		break;
 	case GREGPORT:
 		error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr),
 		    sizeof(sc->gre_port));
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/*
 	 * A successful SIOCSIFPHYADDR{,_IN6} on a configured tunnel brings
 	 * the link up.  The trailing "0" keeps the expression well formed
 	 * when neither INET nor INET6 is compiled in.
 	 */
 	if (error == 0 && sc->gre_family != 0) {
 		if (
 #ifdef INET
 		    cmd == SIOCSIFPHYADDR ||
 #endif
 #ifdef INET6
 		    cmd == SIOCSIFPHYADDR_IN6 ||
 #endif
 		    0) {
 			if_link_state_change(ifp, LINK_STATE_UP);
 		}
 	}
 end:
 	sx_xunlock(&gre_ioctl_sx);
 	return (error);
 }
 
 static void
 gre_delete_tunnel(struct gre_softc *sc)
 {
 	struct gre_socket *gs;
 
 	sx_assert(&gre_ioctl_sx, SA_XLOCKED);
 	if (sc->gre_family != 0) {
 		CK_LIST_REMOVE(sc, chain);
 		CK_LIST_REMOVE(sc, srchash);
 		GRE_WAIT();
 		free(sc->gre_hdr, M_GRE);
 		sc->gre_family = 0;
 	}
 	/*
 	 * If this Tunnel was the last one that could use UDP socket,
 	 * we should unlink socket from hash table and close it.
 	 */
 	if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) {
 		CK_LIST_REMOVE(gs, chain);
 		soclose(gs->so);
 		NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx);
 		sc->gre_so = NULL;
 	}
 	GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 	if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN);
 }
 
 /*
  * Allocate a hash table of GRE_HASH_SIZE list heads and initialize every
  * bucket as an empty list.  Release it with gre_hashdestroy().
  */
 struct gre_list *
 gre_hashinit(void)
 {
 	struct gre_list *table;
 	int bucket;
 
 	table = malloc(sizeof(*table) * GRE_HASH_SIZE, M_GRE, M_WAITOK);
 	bucket = 0;
 	while (bucket < GRE_HASH_SIZE) {
 		CK_LIST_INIT(&table[bucket]);
 		bucket++;
 	}
 
 	return (table);
 }
 
 /*
  * Free a hash table allocated by gre_hashinit().  Only the bucket array
  * itself is released; entries are not walked here.
  */
 void
 gre_hashdestroy(struct gre_list *hash)
 {
 
 	free(hash, M_GRE);
 }
 
 /*
  * Epoch callback (scheduled via NET_EPOCH_CALL in gre_delete_tunnel()):
  * recover the gre_socket that embeds the given epoch context and free it.
  */
 void
 gre_sofree(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct gre_socket, epoch_ctx), M_GRE);
 }
 
 /*
  * Add two 16-bit values using one's-complement arithmetic: a carry out
  * of bit 15 is added back into the low end of the result.
  */
 static __inline uint16_t
 gre_cksum_add(uint16_t sum, uint16_t a)
 {
 	uint16_t folded;
 
 	folded = sum + a;
 	/* The truncated sum is smaller than an operand iff it wrapped. */
 	if (folded < a)
 		folded++;
 	return (folded);
 }
 
 /*
  * Fill in the template UDP header used for GRE-in-UDP encapsulation:
  * the destination port is GRE_UDPPORT, the source port is the configured
  * gre_port, the checksum is seeded with csum and the length is left 0.
  * The caller must hold gre_ioctl_sx exclusively and GRE_UDPENCAP must be
  * enabled.
  */
 void
 gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum)
 {
 
 	sx_assert(&gre_ioctl_sx, SA_XLOCKED);
 	MPASS(sc->gre_options & GRE_UDPENCAP);
 
 	udp->uh_sport = htons(sc->gre_port);
 	udp->uh_dport = htons(GRE_UDPPORT);
 	udp->uh_ulen = 0;
 	udp->uh_sum = csum;
 }
 
 /*
  * Build the flags word and optional fields of the template GRE header
  * from the softc's options and key.  Each enabled optional field gets a
  * 32-bit slot after the base header and grows sc->gre_hlen accordingly.
  * When sequence numbers are disabled the outgoing counter is reset.
  * The caller must hold gre_ioctl_sx exclusively.
  */
 void
 gre_update_hdr(struct gre_softc *sc, struct grehdr *gh)
 {
 	uint32_t *slot = gh->gre_opts;
 	uint16_t hflags = 0;
 
 	sx_assert(&gre_ioctl_sx, SA_XLOCKED);
 
 	/* Checksum + reserved1: zero placeholder, filled in on transmit. */
 	if ((sc->gre_options & GRE_ENABLE_CSUM) != 0) {
 		*slot = 0;
 		slot++;
 		sc->gre_hlen += 2 * sizeof(uint16_t);
 		hflags |= GRE_FLAGS_CP;
 	}
 	/* Key, stored in network byte order. */
 	if (sc->gre_key != 0) {
 		*slot = htonl(sc->gre_key);
 		slot++;
 		sc->gre_hlen += sizeof(uint32_t);
 		hflags |= GRE_FLAGS_KP;
 	}
 	/* Sequence number: zero placeholder, filled in on transmit. */
 	if ((sc->gre_options & GRE_ENABLE_SEQ) == 0) {
 		sc->gre_oseq = 0;
 	} else {
 		*slot = 0;
 		slot++;
 		sc->gre_hlen += sizeof(uint32_t);
 		hflags |= GRE_FLAGS_SP;
 	}
 	gh->gre_flags = htons(hflags);
 }
 
 /*
  * Input path for a GRE packet.
  *
  * m is the received packet, off is the offset of the GRE header within
  * it, and arg is the softc of the matching tunnel.  The GRE header and
  * its optional fields are validated and stripped, and the inner packet
  * is handed to the IPv4 or IPv6 netisr with the gre interface as its
  * receive interface.  The mbuf is always consumed and IPPROTO_DONE is
  * always returned; dropped packets are counted as input errors.
  */
 int
 gre_input(struct mbuf *m, int off, int proto, void *arg)
 {
 	struct gre_softc *sc = arg;
 	struct grehdr *gh;
 	struct ifnet *ifp;
 	uint32_t *opts;
 #ifdef notyet
 	uint32_t key;
 #endif
 	uint16_t flags;
 	int hlen, isr, af;
 
 	ifp = GRE2IFP(sc);
 	/*
 	 * Require the base header plus four 32-bit words to be present and
 	 * contiguous before parsing.
 	 * NOTE(review): presumably three optional fields plus the extra
 	 * WCCP word handled below; confirm.
 	 */
 	hlen = off + sizeof(struct grehdr) + 4 * sizeof(uint32_t);
 	if (m->m_pkthdr.len < hlen)
 		goto drop;
 	if (m->m_len < hlen) {
 		m = m_pullup(m, hlen);
 		if (m == NULL)
 			goto drop;
 	}
 	gh = (struct grehdr *)mtodo(m, off);
 	flags = ntohs(gh->gre_flags);
 	if (flags & ~GRE_FLAGS_MASK)
 		goto drop;
 	opts = gh->gre_opts;
 	/* From here on hlen is the GRE header length to strip. */
 	hlen = 2 * sizeof(uint16_t);
 	if (flags & GRE_FLAGS_CP) {
 		/* reserved1 field must be zero */
 		if (((uint16_t *)opts)[1] != 0)
 			goto drop;
 		if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0)
 			goto drop;
 		hlen += 2 * sizeof(uint16_t);
 		opts++;
 	}
 	if (flags & GRE_FLAGS_KP) {
 #ifdef notyet
         /* 
          * XXX: The current implementation uses the key only for outgoing
          * packets. But we can check the key value here, or even in the
          * encapcheck function.
          */
 		key = ntohl(*opts);
 #endif
 		hlen += sizeof(uint32_t);
 		opts++;
     }
 #ifdef notyet
 	} else
 		key = 0;
 
 	if (sc->gre_key != 0 && (key != sc->gre_key || key != 0))
 		goto drop;
 #endif
 	if (flags & GRE_FLAGS_SP) {
 #ifdef notyet
 		seq = ntohl(*opts);
 #endif
 		hlen += sizeof(uint32_t);
 	}
 	/* Select the netisr and address family from the GRE protocol type. */
 	switch (ntohs(gh->gre_proto)) {
 	case ETHERTYPE_WCCP:
 		/*
 		 * For WCCP skip an additional 4 bytes if after GRE header
 		 * doesn't follow an IP header.
 		 */
 		if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40)
 			hlen += sizeof(uint32_t);
 		/* FALLTHROUGH */
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		af = AF_INET;
 		break;
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		af = AF_INET6;
 		break;
 	default:
 		goto drop;
 	}
 	/* Strip everything up to and including the GRE header. */
 	m_adj(m, off + hlen);
 	m_clrprotoflags(m);
 	m->m_pkthdr.rcvif = ifp;
 	M_SETFIB(m, ifp->if_fib);
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	/* In monitor mode the packet is counted and tapped, then discarded. */
 	if ((ifp->if_flags & IFF_MONITOR) != 0)
 		m_freem(m);
 	else
 		netisr_dispatch(isr, m);
 	return (IPPROTO_DONE);
 drop:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * if_output method.  Work out the address family of the payload and
  * stash it in the mbuf for gre_transmit(), then pass the packet to the
  * interface's if_transmit method.
  */
 static int
 gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
    struct route *ro)
 {
 	uint32_t family;
 
 	if (dst->sa_family != AF_UNSPEC)
 		family = RO_GET_FAMILY(ro, dst);
 	else
 		/* For AF_UNSPEC the family is carried in sa_data. */
 		memcpy(&family, dst->sa_data, sizeof(family));
 
 	/*
 	 * Cheat: reuse the inbound csum_data field to carry the address
 	 * family over to gre_transmit() instead of using yet another mtag.
 	 */
 	m->m_pkthdr.csum_data = family;
 	return (ifp->if_transmit(ifp, m));
 }
 
 /*
  * Store a sequence number (in network byte order) into the sequence
  * field of a GRE header.  The field follows the optional checksum and
  * key words, so its slot index depends on the header's flags.
  */
 static void
 gre_setseqn(struct grehdr *gh, uint32_t seq)
 {
 	uint16_t hflags;
 	int slot;
 
 	hflags = ntohs(gh->gre_flags);
 	KASSERT((hflags & GRE_FLAGS_SP) != 0,
 	    ("gre_setseqn called, but GRE_FLAGS_SP isn't set "));
 
 	slot = 0;
 	if ((hflags & GRE_FLAGS_CP) != 0)
 		slot++;
 	if ((hflags & GRE_FLAGS_KP) != 0)
 		slot++;
 	gh->gre_opts[slot] = htonl(seq);
 }
 
 /*
  * Compute a flow identifier from the inner packet's source and
  * destination addresses.  The result is 0 unless UDP encapsulation is
  * enabled and no fixed port (gre_port == 0) is configured, and also 0
  * for address families that are not handled.  With RSS the 2-tuple RSS
  * hash is used; otherwise the addresses (the last 32 bits for IPv6) are
  * XORed together.
  */
 static uint32_t
 gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af)
 {
 
 	if (!((sc->gre_options & GRE_UDPENCAP) != 0 && sc->gre_port == 0))
 		return (0);
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 #ifdef RSS
 		return (rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src,
 		    mtod(m, struct ip *)->ip_dst));
 #else
 		return (mtod(m, struct ip *)->ip_src.s_addr ^
 		    mtod(m, struct ip *)->ip_dst.s_addr);
 #endif
 #endif
 #ifdef INET6
 	case AF_INET6:
 #ifdef RSS
 		return (rss_hash_ip6_2tuple(
 		    &mtod(m, struct ip6_hdr *)->ip6_src,
 		    &mtod(m, struct ip6_hdr *)->ip6_dst));
 #else
 		return (mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^
 		    mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]);
 #endif
 #endif
 	default:
 		return (0);
 	}
 }
 
 /* Tag identifier passed to if_tunnel_check_nesting() for gre. */
 #define	MTAG_GRE	1307983903
 /*
  * if_transmit method: encapsulate one packet and send it through the
  * outer address family's output routine.
  *
  * The payload address family is read from m_pkthdr.csum_data, where
  * gre_output() stored it.  The prebuilt header template from the softc
  * is prepended and its per-packet fields (UDP port/length/checksum, GRE
  * protocol, sequence number, GRE checksum) are filled in.  The mbuf is
  * always consumed.  Returns 0 or an errno value; failures are counted as
  * output errors.
  */
 static int
 gre_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	GRE_RLOCK_TRACKER;
 	struct gre_softc *sc;
 	struct grehdr *gh;
 	struct udphdr *uh;
 	uint32_t af, flowid;
 	int error, len;
 	uint16_t proto;
 
 	len = 0;
 	GRE_RLOCK();
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error) {
 		m_freem(m);
 		goto drop;
 	}
 #endif
 	/*
 	 * ENETDOWN is the default error for the state checks below; the
 	 * nesting check overwrites it with its own result.
 	 */
 	error = ENETDOWN;
 	sc = ifp->if_softc;
 	if ((ifp->if_flags & IFF_MONITOR) != 0 ||
 	    (ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    sc->gre_family == 0 ||
 	    (error = if_tunnel_check_nesting(ifp, m, MTAG_GRE,
 		V_max_gre_nesting)) != 0) {
 		m_freem(m);
 		goto drop;
 	}
 	/* Address family of the payload, stashed by gre_output(). */
 	af = m->m_pkthdr.csum_data;
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	/* Must be computed before prepending: it reads the inner header. */
 	flowid = gre_flowid(sc, m, af);
 	M_SETFIB(m, sc->gre_fibnum);
 	M_PREPEND(m, sc->gre_hlen, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto drop;
 	}
 	/* Copy in the prebuilt outer header template. */
 	bcopy(sc->gre_hdr, mtod(m, void *), sc->gre_hlen);
 	/* Determine GRE proto */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		proto = htons(ETHERTYPE_IP);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		proto = htons(ETHERTYPE_IPV6);
 		break;
 #endif
 	default:
 		m_freem(m);
 		error = ENETDOWN;
 		goto drop;
 	}
 	/* Determine offset of GRE header */
 	switch (sc->gre_family) {
 #ifdef INET
 	case AF_INET:
 		len = sizeof(struct ip);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		len = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		m_freem(m);
 		error = ENETDOWN;
 		goto drop;
 	}
 	if (sc->gre_options & GRE_UDPENCAP) {
 		/*
 		 * The UDP header sits between the outer IP header and the
 		 * GRE header.  The flow id is folded into the template's
 		 * source port, and the length is added to the checksum seed
 		 * taken from the template.
 		 * NOTE(review): the source port is reduced modulo
 		 * V_ipport_hilastauto, which can yield a value below
 		 * V_ipport_hifirstauto; confirm that this is intended.
 		 */
 		uh = (struct udphdr *)mtodo(m, len);
 		uh->uh_sport |= htons(V_ipport_hifirstauto) |
 		    (flowid >> 16) | (flowid & 0xFFFF);
 		uh->uh_sport = htons(ntohs(uh->uh_sport) %
 		    V_ipport_hilastauto);
 		uh->uh_ulen = htons(m->m_pkthdr.len - len);
 		uh->uh_sum = gre_cksum_add(uh->uh_sum,
 		    htons(m->m_pkthdr.len - len + IPPROTO_UDP));
 		m->m_pkthdr.csum_flags = sc->gre_csumflags;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 		len += sizeof(struct udphdr);
 	}
 	/* len is now the offset of the GRE header. */
 	gh = (struct grehdr *)mtodo(m, len);
 	gh->gre_proto = proto;
 	if (sc->gre_options & GRE_ENABLE_SEQ)
 		gre_setseqn(gh, sc->gre_oseq++);
 	if (sc->gre_options & GRE_ENABLE_CSUM) {
 		/* Checksum covers everything from the GRE header onward. */
 		*(uint16_t *)gh->gre_opts = in_cksum_skip(m,
 		    m->m_pkthdr.len, len);
 	}
 	/* Byte count for the statistics: GRE header plus payload. */
 	len = m->m_pkthdr.len - len;
 	switch (sc->gre_family) {
 #ifdef INET
 	case AF_INET:
 		error = in_gre_output(m, af, sc->gre_hlen);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = in6_gre_output(m, af, sc->gre_hlen, flowid);
 		break;
 #endif
 	default:
 		m_freem(m);
 		error = ENETDOWN;
 	}
 drop:
 	if (error)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	else {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 	}
 	GRE_RUNLOCK();
 	return (error);
 }
 
 /*
  * if_qflush method.  Intentionally empty: gre_transmit() hands packets
  * straight to the output routines and this file keeps no queue.
  */
 static void
 gre_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 /*
  * Module event handler.  Load and unload need no work here and succeed;
  * every other event type is rejected with EOPNOTSUPP.
  */
 static int
 gremodevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /* Module descriptor: module name, event handler, and a 0 third field. */
 static moduledata_t gre_mod = {
 	"if_gre",
 	gremodevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_gre, 1);
diff --git a/sys/net/if_infiniband.c b/sys/net/if_infiniband.c
index c673b7d30a79..e5830d977e80 100644
--- a/sys/net/if_infiniband.c
+++ b/sys/net/if_infiniband.c
@@ -1,653 +1,654 @@
 /*-
  * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/infiniband.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_lagg.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 
 #include <security/mac/mac_framework.h>
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);
 
 #ifdef INET
 /*
  * Build the 20-byte InfiniBand link-layer multicast address for an IPv4
  * group.  addr is the group address in network byte order; the scope
  * nibble (byte 5) and bytes 8-9 are taken from the interface broadcast
  * address; buf receives the 20-byte result.
  */
 static inline void
 infiniband_ipv4_multicast_map(uint32_t addr,
     const uint8_t *broadcast, uint8_t *buf)
 {
 	const uint8_t scope = broadcast[5] & 0xF;
 	const uint32_t group = ntohl(addr);
 
 	buf[0] = 0;
 	memset(&buf[1], 0xff, 4);
 	buf[5] = 0x10 | scope;
 	/* Bytes 6-7: fixed 0x401b marker used for IPv4. */
 	buf[6] = 0x40;
 	buf[7] = 0x1b;
 	buf[8] = broadcast[8];
 	buf[9] = broadcast[9];
 	memset(&buf[10], 0, 6);
 	/* Group address, most significant byte first. */
 	buf[16] = (group >> 24) & 0xff;
 	buf[17] = (group >> 16) & 0xff;
 	buf[18] = (group >> 8) & 0xff;
 	buf[19] = group & 0xff;
 }
 #endif
 
 #ifdef INET6
 /*
  * Build the 20-byte InfiniBand link-layer multicast address for an IPv6
  * group.  The scope nibble (byte 5) and bytes 8-9 are taken from the
  * interface broadcast address; the last 10 bytes of the IPv6 address
  * fill bytes 10-19 of buf.
  */
 static inline void
 infiniband_ipv6_multicast_map(const struct in6_addr *addr,
     const uint8_t *broadcast, uint8_t *buf)
 {
 	const uint8_t scope = broadcast[5] & 0xF;
 
 	buf[0] = 0;
 	memset(&buf[1], 0xff, 4);
 	buf[5] = 0x10 | scope;
 	/* Bytes 6-7: fixed 0x601b marker used for IPv6. */
 	buf[6] = 0x60;
 	buf[7] = 0x1b;
 	buf[8] = broadcast[8];
 	buf[9] = broadcast[9];
 	memcpy(&buf[10], &addr->s6_addr[6], 10);
 }
 #endif
 
 /*
  * This is for clients that have an infiniband_header in the mbuf.
  */
 void
 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct infiniband_header *ibh;
 	struct ether_header eh;
 
 	if (mb->m_len < sizeof(*ibh))
 		return;
 
 	ibh = mtod(mb, struct infiniband_header *);
 	eh.ether_type = ibh->ib_protocol;
 	memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
 	memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
 	mb->m_data += sizeof(*ibh);
 	mb->m_len -= sizeof(*ibh);
 	mb->m_pkthdr.len -= sizeof(*ibh);
 	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
 	mb->m_data -= sizeof(*ibh);
 	mb->m_len += sizeof(*ibh);
 	mb->m_pkthdr.len += sizeof(*ibh);
 }
 
 /*
  * Translate the checksum-offload requests set on src into the matching
  * "already verified" flags on dst.  When data checksum validity is
  * reported, csum_data is set to 0xffff as well.  src and dst may be the
  * same mbuf.
  */
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int verified;
 
 	verified = 0;
 	if ((src->m_pkthdr.csum_flags & CSUM_IP) != 0)
 		verified |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
 	if ((src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0)
 		verified |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 	if ((src->m_pkthdr.csum_flags & CSUM_SCTP) != 0)
 		verified |= CSUM_SCTP_VALID;
 
 	dst->m_pkthdr.csum_flags |= verified;
 	if ((verified & CSUM_DATA_VALID) != 0)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  *
  * Builds an infiniband_header in req->buf for the requested address
  * family and link-layer address.  On success req->bufsize is set to the
  * header size and req->lladdr_off to 0.
  *
  * Returns 0 on success, EOPNOTSUPP for request types other than
  * IFENCAP_LL, ENOMEM when the supplied buffer is too small, and
  * EAFNOSUPPORT for unhandled address families.
  */
 static int
 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct infiniband_header *ih;
 	struct arphdr *ah;
 	uint16_t etype;
 	const uint8_t *lladdr;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < INFINIBAND_HDR_LEN)
 		return (ENOMEM);
 
 	ih = (struct infiniband_header *)req->buf;
 	lladdr = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		etype = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		etype = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		/* Note: this also rewrites the hardware type in the ARP header. */
 		ah = (struct arphdr *)req->hdata;
 		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
 
 		switch (ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			etype = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			etype = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		/* Broadcast requests go to the interface broadcast address. */
 		if (req->flags & IFENCAP_FLAG_BROADCAST)
 			lladdr = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	ih->ib_protocol = etype;
 	ih->ib_reserved = 0;
 	memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN);
 	req->bufsize = sizeof(struct infiniband_header);
 
 	return (0);
 }
 
 /*
  * Produce the link-layer header for a packet sent to dst.
  *
  * Unicast destinations are resolved through arpresolve() (IPv4) or
  * nd6_resolve() (IPv6), which fill phdr.  For IPv4 broadcast and
  * IPv4/IPv6 multicast the header is built here from the interface
  * broadcast address.  On success *pflags is set to RT_MAY_LOOP, plus
  * RT_L2_ME when the resolved entry carries LLE_IFADDR.  If plle is not
  * NULL it is cleared first and then passed on to the resolver.
  *
  * Returns 0 on success or an errno value.  EHOSTDOWN is turned into
  * EHOSTUNREACH when the route has a gateway.  For an unsupported address
  * family the mbuf is freed and EAFNOSUPPORT is returned.
  */
 static int
 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
     uint32_t *pflags, struct llentry **plle)
 {
 #if defined(INET) || defined(INET6)
 	struct infiniband_header *ih = (struct infiniband_header *)phdr;
 #endif
 	uint32_t lleflags = 0;
 	int error = 0;
 
 	if (plle)
 		*plle = NULL;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
 		} else {
 			if (m->m_flags & M_BCAST) {
 				memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
 				    INFINIBAND_ADDR_LEN);
 			} else {
 				infiniband_ipv4_multicast_map(
 				    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
 				    ifp->if_broadcastaddr, ih->ib_hwaddr);
 			}
 			ih->ib_protocol = htons(ETHERTYPE_IP);
 			ih->ib_reserved = 0;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0) {
 			int af = RO_GET_FAMILY(ro, dst);
 			error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
 			    &lleflags, plle);
 		} else {
 			infiniband_ipv6_multicast_map(
 			    &((const struct sockaddr_in6 *)dst)->sin6_addr,
 			    ifp->if_broadcastaddr, ih->ib_hwaddr);
 			ih->ib_protocol = htons(ETHERTYPE_IPV6);
 			ih->ib_reserved = 0;
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* A down host behind a gateway is reported as unreachable. */
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Infiniband output routine.
  *
  * Picks the link-layer header for the packet from, in order of
  * preference: the route's prepend data, a valid cached link-layer entry
  * on the route, or a fresh resolution through infiniband_resolve_addr().
  * Packets addressed to ourselves (RT_L2_ME) are looped back through
  * if_simloop(); everything else gets the header prepended and is handed
  * to the interface's if_transmit method.
  *
  * Must be called inside the network epoch.  Returns 0 or an errno value.
  * On the "bad" path the mbuf is freed here.
  */
 static int
 infiniband_output(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro)
 {
 	uint8_t linkhdr[INFINIBAND_HDR_LEN];
 	uint8_t *phdr;
 	struct llentry *lle = NULL;
 	struct infiniband_header *ih;
 	int error = 0;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 	bool addref;
 
 	NET_EPOCH_ASSERT();
 
 	addref = false;
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		/* XXX BPF uses ro_prepend */
 		if (ro->ro_prepend != NULL) {
 			phdr = ro->ro_prepend;
 			hlen = ro->ro_plen;
 		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
 			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
 				lle = ro->ro_lle;
 				/* Drop a cached entry that is no longer valid. */
 				if (lle != NULL &&
 				    (lle->la_flags & LLE_VALID) == 0) {
 					LLE_FREE(lle);
 					lle = NULL;	/* redundant */
 					ro->ro_lle = NULL;
 				}
 				if (lle == NULL) {
 					/* if we lookup, keep cache */
 					addref = 1;
 				} else
 					/*
 					 * Notify LLE code that
 					 * the entry was used
 					 * by datapath.
 					 */
 					llentry_provide_feedback(lle);
 			}
 			if (lle != NULL) {
 				phdr = lle->r_linkdata;
 				hlen = lle->r_hdrlen;
 				pflags = lle->r_flags;
 			}
 		}
 	}
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		goto bad;
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR) {
 		error = ENETDOWN;
 		goto bad;
 	}
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
 		error = ENETDOWN;
 		goto bad;
 	}
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = INFINIBAND_HDR_LEN;
 		error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
 		    addref ? &lle : NULL);
 		/* Remember the entry found by the lookup in the route cache. */
 		if (addref && lle != NULL)
 			ro->ro_lle = lle;
 		/*
 		 * The mbuf is not freed on this path.
 		 * NOTE(review): presumably the resolver keeps or frees it on
 		 * error, with EWOULDBLOCK meaning it was queued; confirm.
 		 */
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	/* Destination is one of our own addresses: loop the packet back. */
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
 	}
 
 	/*
 	 * Add local infiniband header. If no space in first mbuf,
 	 * allocate another.
 	 */
 	M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto bad;
 	}
 	/* Copy the header in unless the flags say it is already present. */
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		ih = mtod(m, struct infiniband_header *);
 		memcpy(ih, phdr, hlen);
 	}
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return (ifp->if_transmit(ifp, m));
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Process a received Infiniband packet.
  *
  * Runs in the vnet of the receiving interface.  The packet is classified
  * as broadcast or multicast from its hardware address, shown to BPF,
  * optionally passed through the lagg input hook, stripped of its
  * InfiniBand header and dispatched to the IPv4, ARP or IPv6 netisr.
  * The mbuf is always consumed.
  */
 static void
 infiniband_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct infiniband_header *ibh;
 	struct epoch_tracker et;
 	int isr;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		goto done;
 	}
 
 	/*
 	 * NOTE(review): m_len is not checked here; the header is assumed to
 	 * be contiguous in the first mbuf.  Confirm that callers guarantee it.
 	 */
 	ibh = mtod(m, struct infiniband_header *);
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper
 	 * layers:
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 
 	if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
 		/* The interface broadcast address is a multicast address too. */
 		if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
 		    ifp->if_addrlen) == 0)
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 	/* Let BPF have it before we strip the header. */
 	INFINIBAND_BPF_MTAP(ifp, m);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		goto done;
 	}
 
 	/* Direct packet to correct FIB based on interface config. */
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Handle input from a lagg<N> port */
 	if (ifp->if_type == IFT_INFINIBANDLAG) {
 		KASSERT(lagg_input_infiniband_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_infiniband_p)(ifp, m);
 		/* A NULL return means the hook consumed the packet. */
 		if (__predict_false(m == NULL))
 			goto done;
 		ifp = m->m_pkthdr.rcvif;
 	}
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ibh->ib_protocol) {
 #ifdef INET
 	case htons(ETHERTYPE_IP):
 		isr = NETISR_IP;
 		break;
 
 	case htons(ETHERTYPE_ARP):
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			goto done;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case htons(ETHERTYPE_IPV6):
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		goto done;
 	}
 
 	/* Strip off the Infiniband header. */
 	m_adj(m, INFINIBAND_HDR_LEN);
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 	/* Hand the packet to the selected netisr inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	netisr_dispatch(isr, m);
 	NET_EPOCH_EXIT(et);
 done:
 	CURVNET_RESTORE();
 }
 
 /*
  * Resolve the multicast address in 'sa' to a link-level address.
  *
  * AF_LINK addresses are only validated and '*llsa' is set to NULL.
  * For AF_INET/AF_INET6 the sockaddr_dl prepared by link_init_sdl() is
  * filled with the mapped Infiniband multicast address and returned
  * through '*llsa'.
  *
  * Returns 0 on success, EADDRNOTAVAIL when the address is not an
  * acceptable multicast address, and EAFNOSUPPORT for any other family.
  */
 static int
 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
     struct sockaddr *sa)
 {
 	struct sockaddr_dl *dl;
 #ifdef INET
 	struct sockaddr_in *v4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *v6;
 #endif
 	uint8_t *hwaddr;
 
 	switch (sa->sa_family) {
 	case AF_LINK:
 		/* Already a link-level address; just make sure it is multicast. */
 		dl = (struct sockaddr_dl *)sa;
 		hwaddr = LLADDR(dl);
 		if (INFINIBAND_IS_MULTICAST(hwaddr)) {
 			*llsa = NULL;
 			return (0);
 		}
 		return (EADDRNOTAVAIL);
 
 #ifdef INET
 	case AF_INET:
 		v4 = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(v4->sin_addr.s_addr)))
 			return (EADDRNOTAVAIL);
 
 		dl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
 		dl->sdl_alen = INFINIBAND_ADDR_LEN;
 		hwaddr = LLADDR(dl);
 		infiniband_ipv4_multicast_map(
 		    v4->sin_addr.s_addr, ifp->if_broadcastaddr, hwaddr);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		v6 = (struct sockaddr_in6 *)sa;
 		/*
 		 * The unspecified address ("listen to every IPv6 group")
 		 * has no meaning on infiniband, so it is rejected along
 		 * with anything that is not multicast.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&v6->sin6_addr) ||
 		    !IN6_IS_ADDR_MULTICAST(&v6->sin6_addr))
 			return (EADDRNOTAVAIL);
 
 		dl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
 		dl->sdl_alen = INFINIBAND_ADDR_LEN;
 		hwaddr = LLADDR(dl);
 		infiniband_ipv6_multicast_map(
 		    &v6->sin6_addr, ifp->if_broadcastaddr, hwaddr);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 	default:
 		return (EAFNOSUPPORT);
 	}
 }
 
 /*
  * Perform common duties while attaching an Infiniband interface.
  *
  * 'lla' optionally supplies the link-level address to install; when it
  * is NULL the address already stored in the interface's link-level
  * sockaddr is used for the announcement instead.  'llb' optionally
  * supplies the broadcast address.
  */
 void
 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
 {
 	struct sockaddr_dl *link_sdl;
 	struct ifaddr *link_ifa;
 	int pos;
 
 	/* Link-layer parameters must be in place before if_attach(). */
 	ifp->if_addrlen = INFINIBAND_ADDR_LEN;
 	ifp->if_hdrlen = INFINIBAND_HDR_LEN;
 	ifp->if_mtu = INFINIBAND_MTU;
 	if_attach(ifp);
 
 	/* Install the Infiniband link-layer methods. */
 	ifp->if_output = infiniband_output;
 	ifp->if_input = infiniband_input;
 	ifp->if_resolvemulti = infiniband_resolvemulti;
 	ifp->if_requestencap = infiniband_requestencap;
 
 	if (ifp->if_baudrate == 0) {
 		/* default value */
 		ifp->if_baudrate = IF_Gbps(10);
 	}
 	if (llb != NULL) {
 		ifp->if_broadcastaddr = llb;
 	}
 
 	link_ifa = ifp->if_addr;
 	KASSERT(link_ifa != NULL, ("%s: no lladdr!\n", __func__));
 	link_sdl = (struct sockaddr_dl *)link_ifa->ifa_addr;
 	link_sdl->sdl_type = IFT_INFINIBAND;
 	link_sdl->sdl_alen = ifp->if_addrlen;
 
 	if (lla == NULL) {
 		/* Nothing supplied: report whatever is already stored. */
 		lla = LLADDR(link_sdl);
 	} else {
 		memcpy(LLADDR(link_sdl), lla, ifp->if_addrlen);
 
 		if (ifp->if_hw_addr != NULL) {
 			memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
 		}
 	}
 
 	/* Attach ethernet compatible network device */
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 
 	/* Announce the Infiniband address unless every byte of it is zero. */
 	pos = 0;
 	while (pos < ifp->if_addrlen && lla[pos] == 0)
 		pos++;
 	if (pos != ifp->if_addrlen) {
 		if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
 	}
 
 	/* All necessary bits are set up; announce the interface now. */
 	EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
 	}
 }
 
 /*
  * Perform common duties while detaching an Infiniband interface:
  * undo the bpfattach() and if_attach() done by infiniband_ifattach(),
  * in the reverse of the order they were performed there.
  */
 void
 infiniband_ifdetach(struct ifnet *ifp)
 {
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 /*
  * Module event handler.  Load and unload need no work and succeed;
  * every other event type is rejected with EOPNOTSUPP.
  */
 static int
 infiniband_modevent(module_t mod, int type, void *data)
 {
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /* Module description tying the name "if_infiniband" to its event handler. */
 static moduledata_t infiniband_mod = {
 	.name = "if_infiniband",
 	.evhand = &infiniband_modevent,
 };
 
 /* Register the module (SI_SUB_INIT_IF stage) and publish its version. */
 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(if_infiniband, 1);
diff --git a/sys/net/if_ipsec.c b/sys/net/if_ipsec.c
index a2f690b4cffb..2e0b956f5f99 100644
--- a/sys/net/if_ipsec.c
+++ b/sys/net/if_ipsec.c
@@ -1,1084 +1,1085 @@
 /*-
  * Copyright (c) 2016-2018 Yandex LLC
  * Copyright (c) 2016-2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/fnv_hash.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/errno.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/conf.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/scope6_var.h>
 
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 
 #include <net/if_ipsec.h>
 #include <netipsec/key.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Allocation type used for softc and hash tables in this driver. */
 static MALLOC_DEFINE(M_IPSEC, "ipsec", "IPsec Virtual Tunnel Interface");
 /* Interface name prefix used for if_initname() and the cloner. */
 static const char ipsecname[] = "ipsec";
 
 /*
  * Number of security policies kept per interface: one inbound and one
  * outbound policy for each address family compiled in.
  */
 #if defined(INET) && defined(INET6)
 #define	IPSEC_SPCOUNT		4
 #else
 #define	IPSEC_SPCOUNT		2
 #endif
 
 /* Per-interface state of an if_ipsec(4) tunnel. */
 struct ipsec_softc {
 	struct ifnet		*ifp;	/* owning interface */
 	/* in/out policies per family; indexed as in ipsec_getpolicy() */
 	struct secpolicy	*sp[IPSEC_SPCOUNT];
 	uint32_t		reqid;	/* 0 until a reqid is assigned */
 	u_int			family;	/* endpoint AF; 0 = tunnel not set */
 	u_int			fibnum;	/* FIB applied to transmitted mbufs */
 
 	CK_LIST_ENTRY(ipsec_softc) idhash;	/* linkage in reqid hash */
 	CK_LIST_ENTRY(ipsec_softc) srchash;	/* linkage in src addr hash */
 };
 
 /*
  * Reader-side protection is implemented with the preemptible network
  * epoch: IPSEC_RLOCK_TRACKER declares the tracker, IPSEC_RLOCK() and
  * IPSEC_RUNLOCK() enter and leave the epoch, and IPSEC_WAIT() blocks
  * until readers that entered earlier have left.
  */
 #define	IPSEC_RLOCK_TRACKER	struct epoch_tracker ipsec_et
 #define	IPSEC_RLOCK()	epoch_enter_preempt(net_epoch_preempt, &ipsec_et)
 #define	IPSEC_RUNLOCK()	epoch_exit_preempt(net_epoch_preempt, &ipsec_et)
 #define	IPSEC_WAIT()	epoch_wait_preempt(net_epoch_preempt)
 
 /*
  * Number of buckets in each hash table.  The hash functions mask with
  * (IPSEC_HASH_SIZE - 1), so the value has to be a power of two.
  */
 #ifndef IPSEC_HASH_SIZE
 #define	IPSEC_HASH_SIZE	(1 << 5)
 #endif
 
 CK_LIST_HEAD(ipsec_iflist, ipsec_softc);
 /* Per-vnet table of interfaces hashed by reqid. */
 VNET_DEFINE_STATIC(struct ipsec_iflist *, ipsec_idhtbl) = NULL;
 #define	V_ipsec_idhtbl		VNET(ipsec_idhtbl)
 
 #ifdef INET
 /* Per-vnet table of interfaces hashed by IPv4 tunnel source address. */
 VNET_DEFINE_STATIC(struct ipsec_iflist *, ipsec4_srchtbl) = NULL;
 #define	V_ipsec4_srchtbl	VNET(ipsec4_srchtbl)
 /* Handle returned by ip_encap_register_srcaddr(); default vnet only. */
 static const struct srcaddrtab *ipsec4_srctab = NULL;
 #endif
 
 #ifdef INET6
 /* Per-vnet table of interfaces hashed by IPv6 tunnel source address. */
 VNET_DEFINE_STATIC(struct ipsec_iflist *, ipsec6_srchtbl) = NULL;
 #define	V_ipsec6_srchtbl	VNET(ipsec6_srchtbl)
 /* Handle returned by ip6_encap_register_srcaddr(); default vnet only. */
 static const struct srcaddrtab *ipsec6_srctab = NULL;
 #endif
 
 /*
  * Return the bucket of the per-vnet reqid hash table that 'id' maps to.
  */
 static struct ipsec_iflist *
 ipsec_idhash(uint32_t id)
 {
 	uint32_t slot;
 
 	slot = fnv_32_buf(&id, sizeof(id), FNV1_32_INIT);
 	slot &= IPSEC_HASH_SIZE - 1;
 	return (&V_ipsec_idhtbl[slot]);
 }
 
 static struct ipsec_iflist *
 ipsec_srchash(const struct sockaddr *sa)
 {
 	uint32_t hval;
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		hval = fnv_32_buf(
 		    &((const struct sockaddr_in *)sa)->sin_addr.s_addr,
 		    sizeof(in_addr_t), FNV1_32_INIT);
 		return (&V_ipsec4_srchtbl[hval & (IPSEC_HASH_SIZE - 1)]);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		hval = fnv_32_buf(
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr,
 		    sizeof(struct in6_addr), FNV1_32_INIT);
 		return (&V_ipsec6_srchtbl[hval & (IPSEC_HASH_SIZE - 1)]);
 #endif
 	}
 	return (NULL);
 }
 
 /*
  * ipsec_ioctl_sx protects from concurrent ioctls.
  */
 static struct sx ipsec_ioctl_sx;
 SX_SYSINIT(ipsec_ioctl_sx, &ipsec_ioctl_sx, "ipsec_ioctl");
 
 static int	ipsec_init_reqid(struct ipsec_softc *);
 static int	ipsec_set_tunnel(struct ipsec_softc *, struct sockaddr *,
     struct sockaddr *, uint32_t);
 static void	ipsec_delete_tunnel(struct ipsec_softc *);
 
 static int	ipsec_set_addresses(struct ifnet *, struct sockaddr *,
     struct sockaddr *);
 static int	ipsec_set_reqid(struct ipsec_softc *, uint32_t);
 static void	ipsec_set_running(struct ipsec_softc *);
 
 #ifdef VIMAGE
 static void	ipsec_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static void	ipsec_srcaddr(void *, const struct sockaddr *, int);
 static int	ipsec_ioctl(struct ifnet *, u_long, caddr_t);
 static int	ipsec_transmit(struct ifnet *, struct mbuf *);
 static int	ipsec_output(struct ifnet *, struct mbuf *,
     const struct sockaddr *, struct route *);
 static void	ipsec_qflush(struct ifnet *);
 static int	ipsec_clone_create(struct if_clone *, int, caddr_t);
 static void	ipsec_clone_destroy(struct ifnet *);
 
 VNET_DEFINE_STATIC(struct if_clone *, ipsec_cloner);
 #define	V_ipsec_cloner		VNET(ipsec_cloner)
 
 /*
  * Cloner create callback: allocate a softc and an IFT_TUNNEL ifnet,
  * wire up the driver methods and attach the interface and its BPF tap.
  * The tunnel FIB defaults to the FIB of the creating process.
  * Always returns 0.
  */
 static int
 ipsec_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ipsec_softc *sc;
 	struct ifnet *ifp;
 
 	sc = malloc(sizeof(*sc), M_IPSEC, M_WAITOK | M_ZERO);
 	sc->fibnum = curthread->td_proc->p_fibnum;
 
 	ifp = if_alloc(IFT_TUNNEL);
 	sc->ifp = ifp;
 	ifp->if_softc = sc;
 	if_initname(ifp, ipsecname, unit);
 
 	/* Point-to-point tunnel without a link-level address. */
 	ifp->if_addrlen = 0;
 	ifp->if_mtu = IPSEC_MTU;
 	ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
 
 	/* Driver entry points. */
 	ifp->if_ioctl = ipsec_ioctl;
 	ifp->if_transmit = ipsec_transmit;
 	ifp->if_qflush = ipsec_qflush;
 	ifp->if_output = ipsec_output;
 #ifdef VIMAGE
 	ifp->if_reassign = ipsec_reassign;
 #endif
 
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(uint32_t));
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_reassign method: drop the tunnel configuration, if the softc is
  * still attached, while holding the ioctl lock.
  */
 static void
 ipsec_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct ipsec_softc *softc;
 
 	sx_xlock(&ipsec_ioctl_sx);
 	if ((softc = ifp->if_softc) != NULL) {
 		ipsec_delete_tunnel(softc);
 	}
 	sx_xunlock(&ipsec_ioctl_sx);
 }
 #endif /* VIMAGE */
 
 /*
  * Cloner destroy callback: tear down the tunnel, unhook the softc from
  * the reqid hash, detach the interface and finally release both the
  * ifnet and the softc.
  */
 static void
 ipsec_clone_destroy(struct ifnet *ifp)
 {
 	struct ipsec_softc *sc;
 
 	sx_xlock(&ipsec_ioctl_sx);
 	sc = ifp->if_softc;
 	ipsec_delete_tunnel(sc);
 	/*
 	 * Delete softc from idhash on interface destroy, since
 	 * ipsec_delete_tunnel() keeps reqid unchanged.
 	 */
 	if (sc->reqid != 0)
 		CK_LIST_REMOVE(sc, idhash);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	/* ipsec_ioctl() and ipsec_reassign() treat a NULL softc as "gone". */
 	ifp->if_softc = NULL;
 	sx_xunlock(&ipsec_ioctl_sx);
 
 	/* Let epoch readers that may still see the softc finish before freeing. */
 	IPSEC_WAIT();
 	if_free(ifp);
 	free(sc, M_IPSEC);
 }
 
 static struct ipsec_iflist *
 ipsec_hashinit(void)
 {
 	struct ipsec_iflist *hash;
 	int i;
 
 	hash = malloc(sizeof(struct ipsec_iflist) * IPSEC_HASH_SIZE,
 	    M_IPSEC, M_WAITOK);
 	for (i = 0; i < IPSEC_HASH_SIZE; i++)
 		CK_LIST_INIT(&hash[i]);
 
 	return (hash);
 }
 
 /*
  * Per-vnet initialization: allocate the reqid and source address hash
  * tables, register the source address callback (once, from the default
  * vnet) and finally attach the interface cloner.
  */
 static void
 vnet_ipsec_init(const void *unused __unused)
 {
 
 	V_ipsec_idhtbl = ipsec_hashinit();
 #ifdef INET
 	V_ipsec4_srchtbl = ipsec_hashinit();
 	/* The srcaddr table handle is global, so register it only once. */
 	if (IS_DEFAULT_VNET(curvnet))
 		ipsec4_srctab = ip_encap_register_srcaddr(ipsec_srcaddr,
 		    NULL, M_WAITOK);
 #endif
 #ifdef INET6
 	V_ipsec6_srchtbl = ipsec_hashinit();
 	/* Same as above for the IPv6 source address callback. */
 	if (IS_DEFAULT_VNET(curvnet))
 		ipsec6_srctab = ip6_encap_register_srcaddr(ipsec_srcaddr,
 		    NULL, M_WAITOK);
 #endif
 	/* Interfaces can be created from here on. */
 	V_ipsec_cloner = if_clone_simple(ipsecname, ipsec_clone_create,
 	    ipsec_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_ipsec_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_ipsec_init, NULL);
 
 /*
  * Per-vnet teardown, mirroring vnet_ipsec_init(): detach the cloner,
  * release the reqid table, unregister the source address callback
  * (default vnet only) and release the source address tables.
  */
 static void
 vnet_ipsec_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_ipsec_cloner);
 	free(V_ipsec_idhtbl, M_IPSEC);
 	/*
 	 * Use V_ipsec_idhtbl pointer as indicator that VNET is going to be
 	 * destroyed, it is used by ipsec_srcaddr() callback.
 	 */
 	V_ipsec_idhtbl = NULL;
 	/* Wait for epoch readers before the remaining tables are freed. */
 	IPSEC_WAIT();
 
 #ifdef INET
 	if (IS_DEFAULT_VNET(curvnet))
 		ip_encap_unregister_srcaddr(ipsec4_srctab);
 	free(V_ipsec4_srchtbl, M_IPSEC);
 #endif
 #ifdef INET6
 	if (IS_DEFAULT_VNET(curvnet))
 		ip6_encap_unregister_srcaddr(ipsec6_srctab);
 	free(V_ipsec6_srchtbl, M_IPSEC);
 #endif
 }
 VNET_SYSUNINIT(vnet_ipsec_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_ipsec_uninit, NULL);
 
 /*
  * Return the security policy stored in 'sc' for direction 'dir' and
  * address family 'af', or NULL when 'af' is not a compiled-in family.
  *
  * Slot layout of sc->sp[]: inbound precedes outbound, and when both
  * families are compiled in the IPv4 pair precedes the IPv6 pair.
  */
 static struct secpolicy *
 ipsec_getpolicy(struct ipsec_softc *sc, int dir, sa_family_t af)
 {
 	int slot;
 
 	slot = (dir == IPSEC_DIR_INBOUND) ? 0 : 1;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 #ifdef INET
 		/* Skip over the IPv4 pair. */
 		slot += 2;
 #endif
 		break;
 #endif
 	default:
 		return (NULL);
 	}
 	return (sc->sp[slot]);
 }
 
 /*
  * Return the SA index of the first request of the policy selected by
  * 'dir' and 'af', or NULL when ipsec_getpolicy() yields no policy.
  */
 static struct secasindex *
 ipsec_getsaidx(struct ipsec_softc *sc, int dir, sa_family_t af)
 {
 	struct secpolicy *policy;
 
 	policy = ipsec_getpolicy(sc, dir, af);
 	return (policy != NULL ? &policy->req[0]->saidx : NULL);
 }
 
 /*
  * if_transmit method: pass an outgoing packet to IPsec processing using
  * the interface's private outbound policy.
  *
  * The whole function runs inside the network epoch.  On the early drop
  * paths the mbuf is freed here; otherwise it is handed to
  * ipsec4_process_packet()/ipsec6_process_packet().  Returns 0 or an
  * errno value; any non-zero result is counted in IFCOUNTER_OERRORS.
  */
 static int
 ipsec_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	IPSEC_RLOCK_TRACKER;
 	struct ipsec_softc *sc;
 	struct secpolicy *sp;
 	struct ip *ip;
 	uint32_t af;
 	int error;
 
 	IPSEC_RLOCK();
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error) {
 		m_freem(m);
 		goto err;
 	}
 #endif
 	/*
 	 * ENETDOWN is the result of every drop below that does not set
 	 * its own error code, including the loop prevention check.
 	 */
 	error = ENETDOWN;
 	sc = ifp->if_softc;
 	/* Refuse to send unless the tunnel is up, running and configured. */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (ifp->if_flags & IFF_MONITOR) != 0 ||
 	    (ifp->if_flags & IFF_UP) == 0 || sc->family == 0) {
 		m_freem(m);
 		goto err;
 	}
 
 	/*
 	 * Determine address family to correctly handle packet in BPF
 	 *
 	 * NOTE(review): the version nibble is read without a visible
 	 * length check on the first mbuf -- confirm callers guarantee it.
 	 */
 	ip = mtod(m, struct ip *);
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		af = AF_INET;
 		break;
 #endif
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		af = AF_INET6;
 		break;
 #endif
 	default:
 		error = EAFNOSUPPORT;
 		m_freem(m);
 		goto err;
 	}
 
 	/*
 	 * Loop prevention.
 	 * XXX: for now just check presence of IPSEC_OUT_DONE mbuf tag.
 	 *      We can read full chain and compare destination address,
 	 *      proto and mode from xform_history with values from softc.
 	 */
 	if (m_tag_find(m, PACKET_TAG_IPSEC_OUT_DONE, NULL) != NULL) {
 		m_freem(m);
 		goto err;
 	}
 
 	sp = ipsec_getpolicy(sc, IPSEC_DIR_OUTBOUND, af);
 	/*
 	 * NOTE(review): presumably this reference is consumed by the
 	 * process_packet call below -- confirm in netipsec.
 	 */
 	key_addref(sp);
 	M_SETFIB(m, sc->fibnum);
 
 	/* Tap and account the packet before it is transformed. */
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		error = ipsec4_process_packet(m, sp, NULL);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = ipsec6_process_packet(m, sp, NULL);
 		break;
 #endif
 	default:
 		/* Unreachable: af was validated by the switch above. */
 		panic("%s: unknown address family\n", __func__);
 	}
 err:
 	if (error != 0)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	IPSEC_RUNLOCK();
 	return (error);
 }
 
 /*
  * if_qflush method.  Intentionally empty: ipsec_transmit() processes
  * packets directly and this driver keeps no queue of its own.
  */
 static void
 ipsec_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 /*
  * if_output method: 'dst' and 'ro' are ignored and the packet is passed
  * straight to the interface's if_transmit method.
  */
 static int
 ipsec_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 
 	return (ifp->if_transmit(ifp, m));
 }
 
 /*
  * Match an inbound packet, already processed with 'sav', against the
  * configured if_ipsec(4) tunnels and, on a match, account it to that
  * interface.
  *
  * Returns 0 when the packet was either not for any tunnel (mbuf left
  * untouched) or was matched and updated (rcvif, FIB, counters, BPF).
  * Returns ENETDOWN when the mbuf has been freed here: the SA is not
  * usable, or the matched interface is down or in monitor mode.
  */
 int
 ipsec_if_input(struct mbuf *m, struct secasvar *sav, uint32_t af)
 {
 	IPSEC_RLOCK_TRACKER;
 	struct secasindex *saidx;
 	struct ipsec_softc *sc;
 	struct ifnet *ifp;
 
 	if (sav->state != SADB_SASTATE_MATURE &&
 	    sav->state != SADB_SASTATE_DYING) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Only ESP tunnel mode SAs can belong to an if_ipsec(4) interface. */
 	if (sav->sah->saidx.mode != IPSEC_MODE_TUNNEL ||
 	    sav->sah->saidx.proto != IPPROTO_ESP)
 		return (0);
 
 	IPSEC_RLOCK();
 	/* The loop ends with sc == NULL when no tunnel matches. */
 	CK_LIST_FOREACH(sc, ipsec_idhash(sav->sah->saidx.reqid), idhash) {
 		if (sc->family == 0)
 			continue;
 		saidx = ipsec_getsaidx(sc, IPSEC_DIR_INBOUND,
 		    sav->sah->saidx.src.sa.sa_family);
 		/* SA's reqid should match reqid in SP */
 		if (saidx == NULL ||
 		    sav->sah->saidx.reqid != saidx->reqid)
 			continue;
 		/* SAH's addresses should match tunnel endpoints. */
 		if (key_sockaddrcmp(&sav->sah->saidx.dst.sa,
 		    &saidx->dst.sa, 0) != 0)
 			continue;
 		if (key_sockaddrcmp(&sav->sah->saidx.src.sa,
 		    &saidx->src.sa, 0) == 0)
 			break;
 	}
 	if (sc == NULL) {
 		IPSEC_RUNLOCK();
 		/* Tunnel was not found. Nothing to do. */
 		return (0);
 	}
 	ifp = sc->ifp;
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (ifp->if_flags & IFF_UP) == 0) {
 		IPSEC_RUNLOCK();
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	/*
 	 * We found matching and working tunnel.
 	 * Set its ifnet as receiving interface.
 	 */
 	m->m_pkthdr.rcvif = ifp;
 
 	m_clrprotoflags(m);
 	M_SETFIB(m, ifp->if_fib);
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	/* Monitor mode: the packet was tapped and counted, now drop it. */
 	if ((ifp->if_flags & IFF_MONITOR) != 0) {
 		IPSEC_RUNLOCK();
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	IPSEC_RUNLOCK();
 	return (0);
 }
 
 /*
  * if_ioctl method.
  *
  * A first group of requests (address, multicast, flags, MTU) is
  * answered without taking any lock.  Everything else runs with
  * ipsec_ioctl_sx held exclusively: tunnel endpoint set/get/delete,
  * tunnel FIB get/set and reqid get/set.  Returns 0 or an errno value;
  * unknown requests yield EINVAL and a detached softc yields ENXIO.
  */
 static int
 ipsec_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq*)data;
 	struct sockaddr *dst, *src;
 	struct ipsec_softc *sc;
 	struct secasindex *saidx;
 #ifdef INET
 	struct sockaddr_in *sin = NULL;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6 = NULL;
 #endif
 	uint32_t reqid;
 	int error;
 
 	/* Requests that need neither the softc nor the ioctl lock. */
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		/* FALLTHROUGH */
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCGIFMTU:
 	case SIOCSIFFLAGS:
 		return (0);
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < IPSEC_MTU_MIN ||
 		    ifr->ifr_mtu > IPSEC_MTU_MAX)
 			return (EINVAL);
 		else
 			ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	}
 	sx_xlock(&ipsec_ioctl_sx);
 	sc = ifp->if_softc;
 	/* Check that softc is still here */
 	if (sc == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	error = 0;
 	switch (cmd) {
 	case SIOCSIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 		/*
 		 * Set tunnel endpoints.  'error' is preloaded before each
 		 * validation step so that "goto bad" reports the right
 		 * code for the step that failed.
 		 */
 		error = EINVAL;
 		switch (cmd) {
 #ifdef INET
 		case SIOCSIFPHYADDR:
 			src = (struct sockaddr *)
 				&(((struct in_aliasreq *)data)->ifra_addr);
 			dst = (struct sockaddr *)
 				&(((struct in_aliasreq *)data)->ifra_dstaddr);
 			break;
 #endif
 #ifdef INET6
 		case SIOCSIFPHYADDR_IN6:
 			src = (struct sockaddr *)
 				&(((struct in6_aliasreq *)data)->ifra_addr);
 			dst = (struct sockaddr *)
 				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
 			break;
 #endif
 		default:
 			goto bad;
 		}
 		/* sa_family must be equal */
 		if (src->sa_family != dst->sa_family ||
 		    src->sa_len != dst->sa_len)
 			goto bad;
 
 		/* validate sa_len */
 		switch (src->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (src->sa_len != sizeof(struct sockaddr_in))
 				goto bad;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (src->sa_len != sizeof(struct sockaddr_in6))
 				goto bad;
 			break;
 #endif
 		default:
 			error = EAFNOSUPPORT;
 			goto bad;
 		}
 		/* check sa_family looks sane for the cmd */
 		error = EAFNOSUPPORT;
 		switch (cmd) {
 #ifdef INET
 		case SIOCSIFPHYADDR:
 			if (src->sa_family == AF_INET)
 				break;
 			goto bad;
 #endif
 #ifdef INET6
 		case SIOCSIFPHYADDR_IN6:
 			if (src->sa_family == AF_INET6)
 				break;
 			goto bad;
 #endif
 		}
 		/* Neither endpoint may be the unspecified address. */
 		error = EADDRNOTAVAIL;
 		switch (src->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
 			    satosin(dst)->sin_addr.s_addr == INADDR_ANY)
 				goto bad;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (IN6_IS_ADDR_UNSPECIFIED(
 			    &satosin6(src)->sin6_addr) ||
 			    IN6_IS_ADDR_UNSPECIFIED(
 			    &satosin6(dst)->sin6_addr))
 				goto bad;
 			/*
 			 * Check validity of the scope zone ID of the
 			 * addresses, and convert it into the kernel
 			 * internal form if necessary.
 			 */
 			error = sa6_embedscope(satosin6(src), 0);
 			if (error != 0)
 				goto bad;
 			error = sa6_embedscope(satosin6(dst), 0);
 			if (error != 0)
 				goto bad;
 #endif
 		};
 		error = ipsec_set_addresses(ifp, src, dst);
 		break;
 	case SIOCDIFPHYADDR:
 		ipsec_delete_tunnel(sc);
 		break;
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 #ifdef INET6
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 #endif
 		/* Report a tunnel endpoint taken from the outbound SA index. */
 		if (sc->family == 0) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		saidx = ipsec_getsaidx(sc, IPSEC_DIR_OUTBOUND, sc->family);
 		if (saidx == NULL) {
 			error = ENXIO;
 			break;
 		}
 		/* Step 1: prepare an empty sockaddr of the requested family. */
 		switch (cmd) {
 #ifdef INET
 		case SIOCGIFPSRCADDR:
 		case SIOCGIFPDSTADDR:
 			if (saidx->src.sa.sa_family != AF_INET) {
 				error = EADDRNOTAVAIL;
 				break;
 			}
 			sin = (struct sockaddr_in *)&ifr->ifr_addr;
 			memset(sin, 0, sizeof(*sin));
 			sin->sin_family = AF_INET;
 			sin->sin_len = sizeof(*sin);
 			break;
 #endif
 #ifdef INET6
 		case SIOCGIFPSRCADDR_IN6:
 		case SIOCGIFPDSTADDR_IN6:
 			if (saidx->src.sa.sa_family != AF_INET6) {
 				error = EADDRNOTAVAIL;
 				break;
 			}
 			sin6 = (struct sockaddr_in6 *)
 				&(((struct in6_ifreq *)data)->ifr_addr);
 			memset(sin6, 0, sizeof(*sin6));
 			sin6->sin6_family = AF_INET6;
 			sin6->sin6_len = sizeof(*sin6);
 			break;
 #endif
 		default:
 			error = EAFNOSUPPORT;
 		}
 		/* Step 2: copy in the requested endpoint address. */
 		if (error == 0) {
 			switch (cmd) {
 #ifdef INET
 			case SIOCGIFPSRCADDR:
 				sin->sin_addr = saidx->src.sin.sin_addr;
 				break;
 			case SIOCGIFPDSTADDR:
 				sin->sin_addr = saidx->dst.sin.sin_addr;
 				break;
 #endif
 #ifdef INET6
 			case SIOCGIFPSRCADDR_IN6:
 				sin6->sin6_addr = saidx->src.sin6.sin6_addr;
 				break;
 			case SIOCGIFPDSTADDR_IN6:
 				sin6->sin6_addr = saidx->dst.sin6.sin6_addr;
 				break;
 #endif
 			}
 		}
 		if (error != 0)
 			break;
 		/*
 		 * Step 3: apply the jail check; the answer is wiped again
 		 * when the check (or the IPv6 scope recovery) fails.
 		 */
 		switch (cmd) {
 #ifdef INET
 		case SIOCGIFPSRCADDR:
 		case SIOCGIFPDSTADDR:
 			error = prison_if(curthread->td_ucred,
 			    (struct sockaddr *)sin);
 			if (error != 0)
 				memset(sin, 0, sizeof(*sin));
 			break;
 #endif
 #ifdef INET6
 		case SIOCGIFPSRCADDR_IN6:
 		case SIOCGIFPDSTADDR_IN6:
 			error = prison_if(curthread->td_ucred,
 			    (struct sockaddr *)sin6);
 			if (error == 0)
 				error = sa6_recoverscope(sin6);
 			if (error != 0)
 				memset(sin6, 0, sizeof(*sin6));
 #endif
 		}
 		break;
 	case SIOCGTUNFIB:
 		ifr->ifr_fib = sc->fibnum;
 		break;
 	case SIOCSTUNFIB:
 		if ((error = priv_check(curthread, PRIV_NET_SETIFFIB)) != 0)
 			break;
 		if (ifr->ifr_fib >= rt_numfibs)
 			error = EINVAL;
 		else
 			sc->fibnum = ifr->ifr_fib;
 		break;
 	case IPSECGREQID:
 		/* Copy through a local so only sizeof(reqid) bytes leave. */
 		reqid = sc->reqid;
 		error = copyout(&reqid, ifr_data_get_ptr(ifr), sizeof(reqid));
 		break;
 	case IPSECSREQID:
 		if ((error = priv_check(curthread, PRIV_NET_SETIFCAP)) != 0)
 			break;
 		error = copyin(ifr_data_get_ptr(ifr), &reqid, sizeof(reqid));
 		if (error != 0)
 			break;
 		error = ipsec_set_reqid(sc, reqid);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 bad:
 	sx_xunlock(&ipsec_ioctl_sx);
 	return (error);
 }
 
 /*
  * Recompute IFF_DRV_RUNNING from the tunnel's outbound source address:
  * the flag is set when in_localip()/in6_localip() reports the address
  * as belonging to the local host and cleared otherwise.  Nothing is
  * changed when no outbound SA index exists for the configured family.
  */
 static void
 ipsec_set_running(struct ipsec_softc *sc)
 {
 	struct secasindex *out_idx;
 	int is_local = 0;
 
 	out_idx = ipsec_getsaidx(sc, IPSEC_DIR_OUTBOUND, sc->family);
 	if (out_idx == NULL)
 		return;
 
 	switch (sc->family) {
 #ifdef INET
 	case AF_INET:
 		is_local = in_localip(out_idx->src.sin.sin_addr);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		is_local = in6_localip(&out_idx->src.sin6.sin6_addr);
 		break;
 #endif
 	}
 
 	if (is_local == 0) {
 		sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 }
 
 /*
  * Source address event callback (registered in vnet_ipsec_init()).
  * For every configured tunnel whose outbound source address equals
  * 'sa', re-evaluate IFF_DRV_RUNNING, so that the flag is cleared when
  * the ingress address disappears and source address spoofing is
  * prevented.  Must be called inside the network epoch.
  */
 static void
 ipsec_srcaddr(void *arg __unused, const struct sockaddr *sa,
     int event __unused)
 {
 	struct ipsec_iflist *bucket;
 	struct secasindex *out_idx;
 	struct ipsec_softc *sc;
 
 	/* A NULL id table means the vnet is not (or no longer) set up. */
 	if (V_ipsec_idhtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	bucket = ipsec_srchash(sa);
 	if (bucket == NULL)
 		return;
 
 	CK_LIST_FOREACH(sc, bucket, srchash) {
 		if (sc->family == 0)
 			continue;
 		out_idx = ipsec_getsaidx(sc, IPSEC_DIR_OUTBOUND,
 		    sa->sa_family);
 		if (out_idx != NULL &&
 		    key_sockaddrcmp(&out_idx->src.sa, sa, 0) == 0)
 			ipsec_set_running(sc);
 	}
 }
 
 /*
  * Allocate new private security policies for tunneling interface.
  * Each tunneling interface has following security policies for
  * both AF:
  *   0.0.0.0/0[any] 0.0.0.0/0[any] -P in \
  *	ipsec esp/tunnel/RemoteIP-LocalIP/unique:reqid
  *   0.0.0.0/0[any] 0.0.0.0/0[any] -P out \
  *	ipsec esp/tunnel/LocalIP-RemoteIP/unique:reqid
  *
  * 'sp' receives IPSEC_SPCOUNT policies laid out as expected by
  * ipsec_getpolicy(): even slots are inbound, odd slots outbound.
  * 'src' and 'dst' are the local and remote tunnel endpoints.
  *
  * Returns 0 on success.  On allocation failure every policy that was
  * already allocated is released and ENOMEM is returned.
  */
 static int
 ipsec_newpolicies(struct ipsec_softc *sc, struct secpolicy *sp[IPSEC_SPCOUNT],
     const struct sockaddr *src, const struct sockaddr *dst, uint32_t reqid)
 {
 	struct ipsecrequest *isr;
 	int i;
 
 	/* Start from all-NULL so the failure path knows what to release. */
 	memset(sp, 0, sizeof(struct secpolicy *) * IPSEC_SPCOUNT);
 	for (i = 0; i < IPSEC_SPCOUNT; i++) {
 		if ((sp[i] = key_newsp()) == NULL)
 			goto fail;
 		if ((isr = ipsec_newisr()) == NULL)
 			goto fail;
 
 		sp[i]->policy = IPSEC_POLICY_IPSEC;
 		sp[i]->state = IPSEC_SPSTATE_DEAD;
 		sp[i]->req[sp[i]->tcount++] = isr;
 		sp[i]->created = time_second;
 		/* Use priority field to store if_index */
 		sp[i]->priority = sc->ifp->if_index;
 		isr->level = IPSEC_LEVEL_UNIQUE;
 		isr->saidx.proto = IPPROTO_ESP;
 		isr->saidx.mode = IPSEC_MODE_TUNNEL;
 		isr->saidx.reqid = reqid;
 		if (i % 2 == 0) {
 			/* Inbound: packets travel from remote to local. */
 			sp[i]->spidx.dir = IPSEC_DIR_INBOUND;
 			bcopy(src, &isr->saidx.dst, src->sa_len);
 			bcopy(dst, &isr->saidx.src, dst->sa_len);
 		} else {
 			/* Outbound: packets travel from local to remote. */
 			sp[i]->spidx.dir = IPSEC_DIR_OUTBOUND;
 			bcopy(src, &isr->saidx.src, src->sa_len);
 			bcopy(dst, &isr->saidx.dst, dst->sa_len);
 		}
 		sp[i]->spidx.ul_proto = IPSEC_ULPROTO_ANY;
 #ifdef INET
 		/* With INET compiled in, the first pair is the IPv4 one. */
 		if (i < 2) {
 			sp[i]->spidx.src.sa.sa_family =
 			    sp[i]->spidx.dst.sa.sa_family = AF_INET;
 			sp[i]->spidx.src.sa.sa_len =
 			    sp[i]->spidx.dst.sa.sa_len =
 			    sizeof(struct sockaddr_in);
 			continue;
 		}
 #endif
 #ifdef INET6
 		sp[i]->spidx.src.sa.sa_family =
 		    sp[i]->spidx.dst.sa.sa_family = AF_INET6;
 		sp[i]->spidx.src.sa.sa_len =
 		    sp[i]->spidx.dst.sa.sa_len = sizeof(struct sockaddr_in6);
 #endif
 	}
 	return (0);
 fail:
 	/*
 	 * Release only the slots that were actually allocated; the
 	 * remaining ones are still NULL from the memset() above and must
 	 * not be handed to key_freesp().
 	 */
 	for (i = 0; i < IPSEC_SPCOUNT; i++) {
 		if (sp[i] != NULL)
 			key_freesp(&sp[i]);
 	}
 	return (ENOMEM);
 }
 
 static int
 ipsec_check_reqid(uint32_t reqid)
 {
 	struct ipsec_softc *sc;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 	CK_LIST_FOREACH(sc, ipsec_idhash(reqid), idhash) {
 		if (sc->reqid == reqid)
 			return (EEXIST);
 	}
 	return (0);
 }
 
 /*
  * Assign an automatically generated reqid to 'sc' unless it already
  * has one.  Candidates come from key_newreqid() and are accepted only
  * when no other if_ipsec(4) interface uses them; IPSEC_REQID_TRYCNT
  * bounds the number of attempts.
  *
  * Returns 0 on success (including the already-initialized case) and
  * EEXIST when no unique id was found.  The ioctl lock must be held
  * exclusively.
  */
 #define	IPSEC_REQID_TRYCNT	64
 static int
 ipsec_init_reqid(struct ipsec_softc *sc)
 {
 	uint32_t reqid;
 	int attempts_left;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 	if (sc->reqid != 0) {
 		/* already initialized */
 		return (0);
 	}
 
 	for (attempts_left = IPSEC_REQID_TRYCNT - 1; attempts_left > 0;
 	    attempts_left--) {
 		reqid = key_newreqid();
 		if (ipsec_check_reqid(reqid) == 0)
 			break;
 	}
 	/* The counter only reaches zero when every attempt collided. */
 	if (attempts_left == 0)
 		return (EEXIST);
 
 	sc->reqid = reqid;
 	CK_LIST_INSERT_HEAD(ipsec_idhash(reqid), sc, idhash);
 	return (0);
 }
 
 /*
  * Set or update reqid for given tunneling interface.
  *
  * A non-zero 'reqid' must not be in use by another interface (EEXIST
  * otherwise); the softc is then rehashed under the new id.  A zero
  * 'reqid' delegates to ipsec_init_reqid(), which generates an id only
  * when the softc does not have one yet.  When the tunnel is already
  * configured its policies are rebuilt with the resulting reqid.
  *
  * The ioctl lock must be held exclusively; it protects against
  * concurrent id generation and keeps the softc from disappearing.
  */
 static int
 ipsec_set_reqid(struct ipsec_softc *sc, uint32_t reqid)
 {
 	struct secasindex *out_idx;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 
 	/* Asked for the id we already have: nothing to do. */
 	if (reqid != 0 && sc->reqid == reqid)
 		return (0);
 
 	if (reqid == 0) {
 		/* Generate new reqid */
 		if (ipsec_init_reqid(sc) != 0)
 			return (EEXIST);
 	} else {
 		/* Check that specified reqid doesn't exist */
 		if (ipsec_check_reqid(reqid) != 0)
 			return (EEXIST);
 		if (sc->reqid != 0) {
 			/* Unhash the old id and let readers drain. */
 			CK_LIST_REMOVE(sc, idhash);
 			IPSEC_WAIT();
 		}
 		sc->reqid = reqid;
 		CK_LIST_INSERT_HEAD(ipsec_idhash(reqid), sc, idhash);
 	}
 
 	/* Tunnel isn't fully configured, just return. */
 	if (sc->family == 0)
 		return (0);
 
 	out_idx = ipsec_getsaidx(sc, IPSEC_DIR_OUTBOUND, sc->family);
 	KASSERT(out_idx != NULL,
 	    ("saidx is NULL, but family is %d", sc->family));
 	return (ipsec_set_tunnel(sc, &out_idx->src.sa, &out_idx->dst.sa,
 	    sc->reqid));
 }
 
 /*
  * Set tunnel endpoints addresses.
  *
  * Returns 0 without touching anything when the tunnel is already
  * configured with the same reqid and endpoints.  Otherwise a reqid is
  * generated if the softc has none (EEXIST on failure) and the tunnel
  * is (re)built through ipsec_set_tunnel().  The ioctl lock must be
  * held exclusively.
  */
 static int
 ipsec_set_addresses(struct ifnet *ifp, struct sockaddr *src,
     struct sockaddr *dst)
 {
 	struct ipsec_softc *sc = ifp->if_softc;
 	struct secasindex *cur;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 
 	if (sc->family != 0) {
 		cur = ipsec_getsaidx(sc, IPSEC_DIR_OUTBOUND, src->sa_family);
 		if (cur != NULL && cur->reqid == sc->reqid &&
 		    key_sockaddrcmp(&cur->src.sa, src, 0) == 0 &&
 		    key_sockaddrcmp(&cur->dst.sa, dst, 0) == 0) {
 			/* Nothing has been changed. */
 			return (0);
 		}
 	}
 
 	/* If reqid is not set, generate new one. */
 	if (ipsec_init_reqid(sc) != 0)
 		return (EEXIST);
 
 	return (ipsec_set_tunnel(sc, src, dst, sc->reqid));
 }
 
 /*
  * Install a new set of policies for the tunnel 'src' -> 'dst' with the
  * given reqid, replacing any existing configuration, and re-evaluate
  * IFF_DRV_RUNNING.
  *
  * Returns 0 on success, EAFNOSUPPORT when 'src' is of an unsupported
  * family, ENOMEM when the policies cannot be allocated (both of these
  * also clear IFF_DRV_RUNNING) and EAGAIN when the policies cannot be
  * registered.  The ioctl lock must be held exclusively.
  */
 static int
 ipsec_set_tunnel(struct ipsec_softc *sc, struct sockaddr *src,
     struct sockaddr *dst, uint32_t reqid)
 {
 	struct epoch_tracker et;
 	struct ipsec_iflist *iflist;
 	struct secpolicy *sp[IPSEC_SPCOUNT];
 	int i;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 
 	/* Allocate SP with new addresses. */
 	iflist = ipsec_srchash(src);
 	if (iflist == NULL) {
 		sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return (EAFNOSUPPORT);
 	}
 	if (ipsec_newpolicies(sc, sp, src, dst, reqid) == 0) {
 		/* Add new policies to SPDB */
 		if (key_register_ifnet(sp, IPSEC_SPCOUNT) != 0) {
 			for (i = 0; i < IPSEC_SPCOUNT; i++)
 				key_freesp(&sp[i]);
 			return (EAGAIN);
 		}
 		/*
 		 * The new policies hold their own copies of src/dst, so
 		 * the old ones (which the caller's pointers may refer to)
 		 * can be dropped only now.
 		 */
 		if (sc->family != 0)
 			ipsec_delete_tunnel(sc);
 		for (i = 0; i < IPSEC_SPCOUNT; i++)
 			sc->sp[i] = sp[i];
 		sc->family = src->sa_family;
 		CK_LIST_INSERT_HEAD(iflist, sc, srchash);
 	} else {
 		sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return (ENOMEM);
 	}
 	/* Run the local address check inside the net epoch. */
 	NET_EPOCH_ENTER(et);
 	ipsec_set_running(sc);
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 
 /*
  * Remove the tunnel configuration from 'sc': clear IFF_DRV_RUNNING and,
  * when endpoints were configured, unhash the softc from the source
  * address table, mark it unconfigured and release its policies.  The
  * reqid and its hash linkage are left untouched.  The ioctl lock must
  * be held exclusively.
  */
 static void
 ipsec_delete_tunnel(struct ipsec_softc *sc)
 {
 	int slot;
 
 	sx_assert(&ipsec_ioctl_sx, SA_XLOCKED);
 
 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	if (sc->family == 0)
 		return;
 
 	CK_LIST_REMOVE(sc, srchash);
 	sc->family = 0;
 	/*
 	 * Make sure that ipsec_if_input() will not do access
 	 * to softc's policies.
 	 */
 	IPSEC_WAIT();
 
 	key_unregister_ifnet(sc->sp, IPSEC_SPCOUNT);
 	for (slot = 0; slot < IPSEC_SPCOUNT; slot++) {
 		key_freesp(&sc->sp[slot]);
 	}
 }
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 58157e0dff3f..c649a2c3372e 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -1,2754 +1,2755 @@
 /*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
 
 /*
  * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
  * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
  * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/eventhandler.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/bpf.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <net/infiniband.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 
 #include <net/if_vlan_var.h>
 #include <net/if_lagg.h>
 #include <net/ieee8023ad_lacp.h>
 
 #ifdef INET6
 /*
  * XXX: declared here to avoid including many inet6-related files;
  * should this be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 #ifdef DEV_NETMAP
 MODULE_DEPEND(if_lagg, netmap, 1, 1, 1);
 #endif
 
 /*
  * Wrappers around the per-softc sx lock (sc_sx): init/destroy, exclusive
  * lock/unlock, and assertions that the lock is held (SA_LOCKED) or held
  * exclusively (SA_XLOCKED).
  */
 #define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
 #define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
 #define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
 #define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
 #define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
 #define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
 
 /*
  * Special flags we should propagate to the lagg ports.
  * Each entry pairs an interface flag with the routine taking
  * (ifnet, int) that is associated with it; the table is terminated
  * by a {0, NULL} entry.
  */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } lagg_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 /*
  * Send tag used by the lagg interface: an embedded m_snd_tag plus a
  * pointer to a second m_snd_tag.
  * NOTE(review): "tag" is presumably the tag allocated on the underlying
  * port -- confirm against lagg_snd_tag_alloc().
  */
 struct lagg_snd_tag {
 	struct m_snd_tag com;
 	struct m_snd_tag *tag;
 };
 
 /*
  * Per-vnet list of lagg softcs and the mutex protecting it.  The
  * LAGG_LIST_* macros take an (unused) argument and always operate on the
  * current vnet's mutex.
  */
 VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
 #define	V_lagg_list	VNET(lagg_list)
 VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
 #define	V_lagg_list_mtx	VNET(lagg_list_mtx)
 #define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
 					"if_lagg list", NULL, MTX_DEF)
 #define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
 #define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
 #define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
 /* Handle for the ifnet_departure_event handler set up in lagg_modevent(). */
 eventhandler_tag	lagg_detach_cookie = NULL;
 
 static int	lagg_clone_create(struct if_clone *, char *, size_t,
 		    struct ifc_data *, struct ifnet **);
 static int	lagg_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
 #define	V_lagg_cloner	VNET(lagg_cloner)
 static const char laggname[] = "lagg";
 static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
 
 static void	lagg_capabilities(struct lagg_softc *);
 static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
 static int	lagg_port_destroy(struct lagg_port *, int);
 static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *);
 static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *);
 static void	lagg_linkstate(struct lagg_softc *);
 static void	lagg_port_state(struct ifnet *, int);
 static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
 static int	lagg_port_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
 #ifdef LAGG_PORT_STACKING
 static int	lagg_port_checkstacking(struct lagg_softc *);
 #endif
 static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
 static void	lagg_init(void *);
 static void	lagg_stop(struct lagg_softc *);
 static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static int	lagg_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *,
 		    struct m_snd_tag **);
 static int	lagg_snd_tag_modify(struct m_snd_tag *,
 		    union if_snd_tag_modify_params *);
 static int	lagg_snd_tag_query(struct m_snd_tag *,
 		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
 static struct m_snd_tag *lagg_next_snd_tag(struct m_snd_tag *);
 static void     lagg_ratelimit_query(struct ifnet *,
 		    struct if_ratelimit_query_results *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
 static	void	lagg_setcaps(struct lagg_port *, int cap, int cap2);
 static	int	lagg_setflag(struct lagg_port *, int, int,
 		    int (*func)(struct ifnet *, int));
 static	int	lagg_setflags(struct lagg_port *, int status);
 static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
 static int	lagg_transmit_ethernet(struct ifnet *, struct mbuf *);
 static int	lagg_transmit_infiniband(struct ifnet *, struct mbuf *);
 static void	lagg_qflush(struct ifnet *);
 static int	lagg_media_change(struct ifnet *);
 static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
 static struct lagg_port *lagg_link_active(struct lagg_softc *,
 	    struct lagg_port *);
 
 /* Simple round robin */
 static void	lagg_rr_attach(struct lagg_softc *);
 static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Active failover */
 static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Loadbalancing */
 static void	lagg_lb_attach(struct lagg_softc *);
 static void	lagg_lb_detach(struct lagg_softc *);
 static int	lagg_lb_port_create(struct lagg_port *);
 static void	lagg_lb_port_destroy(struct lagg_port *);
 static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
 
 /* Broadcast */
 static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* 802.3ad LACP */
 static void	lagg_lacp_attach(struct lagg_softc *);
 static void	lagg_lacp_detach(struct lagg_softc *);
 static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static void	lagg_lacp_lladdr(struct lagg_softc *);
 
 /*
  * lagg protocol table
  *
  * One entry per aggregation protocol, holding the protocol's callbacks.
  * The lagg_proto_*() wrappers index this array directly with
  * sc->sc_proto and treat a NULL callback as "nothing to do", except for
  * pr_start and pr_input, which are called unconditionally.
  * NOTE(review): this relies on the entry order matching the numeric
  * values of the lagg_proto enum -- confirm against if_lagg.h.
  */
 static const struct lagg_proto {
 	lagg_proto	pr_num;
 	void		(*pr_attach)(struct lagg_softc *);
 	void		(*pr_detach)(struct lagg_softc *);
 	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
 	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
 			    struct mbuf *);
 	int		(*pr_addport)(struct lagg_port *);
 	void		(*pr_delport)(struct lagg_port *);
 	void		(*pr_linkstate)(struct lagg_port *);
 	void 		(*pr_init)(struct lagg_softc *);
 	void 		(*pr_stop)(struct lagg_softc *);
 	void 		(*pr_lladdr)(struct lagg_softc *);
 	void		(*pr_request)(struct lagg_softc *, void *);
 	void		(*pr_portreq)(struct lagg_port *, void *);
 } lagg_protos[] = {
     {
 	/* No protocol: every callback is NULL. */
 	.pr_num = LAGG_PROTO_NONE
     },
     {
 	.pr_num = LAGG_PROTO_ROUNDROBIN,
 	.pr_attach = lagg_rr_attach,
 	.pr_start = lagg_rr_start,
 	.pr_input = lagg_rr_input,
     },
     {
 	.pr_num = LAGG_PROTO_FAILOVER,
 	.pr_start = lagg_fail_start,
 	.pr_input = lagg_fail_input,
     },
     {
 	.pr_num = LAGG_PROTO_LOADBALANCE,
 	.pr_attach = lagg_lb_attach,
 	.pr_detach = lagg_lb_detach,
 	.pr_start = lagg_lb_start,
 	.pr_input = lagg_lb_input,
 	.pr_addport = lagg_lb_port_create,
 	.pr_delport = lagg_lb_port_destroy,
     },
     {
 	.pr_num = LAGG_PROTO_LACP,
 	.pr_attach = lagg_lacp_attach,
 	.pr_detach = lagg_lacp_detach,
 	.pr_start = lagg_lacp_start,
 	.pr_input = lagg_lacp_input,
 	.pr_addport = lacp_port_create,
 	.pr_delport = lacp_port_destroy,
 	.pr_linkstate = lacp_linkstate,
 	.pr_init = lacp_init,
 	.pr_stop = lacp_stop,
 	.pr_lladdr = lagg_lacp_lladdr,
 	.pr_request = lacp_req,
 	.pr_portreq = lacp_portreq,
     },
     {
 	.pr_num = LAGG_PROTO_BROADCAST,
 	.pr_start = lagg_bcast_start,
 	.pr_input = lagg_bcast_input,
     },
 };
 
 /*
  * net.link.lagg sysctl tree.  The three "default_*" knobs below are read
  * by lagg_clone_create() to seed the options of a newly created lagg.
  */
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Link Aggregation");
 
 /* Allow input on any failover links */
 VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
 #define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(lagg_failover_rx_all), 0,
     "Accept input from any interface in a failover lagg");
 
 /* Default value for using flowid (sets LAGG_OPT_USE_FLOWID when non-zero) */
 VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
 #define	V_def_use_flowid	VNET(def_use_flowid)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_flowid), 0,
     "Default setting for using flow id for load sharing");
 
 /* Default value for using numa (sets LAGG_OPT_USE_NUMA when non-zero) */
 VNET_DEFINE_STATIC(int, def_use_numa) = 1;
 #define	V_def_use_numa	VNET(def_use_numa)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_numa), 0,
     "Use numa to steer flows");
 
 /* Default value for flowid shift (copied into sc->flowid_shift) */
 VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
 #define	V_def_flowid_shift	VNET(def_flowid_shift)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
     &VNET_NAME(def_flowid_shift), 0,
     "Default setting for flowid shift for load sharing");
 
 /*
  * Per-vnet initialisation: set up the lagg list and its mutex, then
  * attach the "lagg" interface cloner with automatic unit numbering.
  */
 static void
 vnet_lagg_init(const void *unused __unused)
 {
 	struct if_clone_addreq req = {
 		.create_f = lagg_clone_create,
 		.destroy_f = lagg_clone_destroy,
 		.flags = IFC_F_AUTOUNIT,
 	};
 
 	LAGG_LIST_LOCK_INIT();
 	SLIST_INIT(&V_lagg_list);
 	V_lagg_cloner = ifc_attach_cloner(laggname, &req);
 }
 VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_lagg_init, NULL);
 
 /*
  * Per-vnet teardown: detach the lagg cloner and destroy the lagg list
  * mutex created by vnet_lagg_init().
  */
 static void
 vnet_lagg_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_lagg_cloner);
 	LAGG_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_lagg_uninit, NULL);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD installs the lagg input and link-state hooks and registers
  * lagg_port_ifdetach() for ifnet_departure_event; MOD_UNLOAD undoes both.
  * Any other event type yields EOPNOTSUPP.
  */
 static int
 lagg_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD) {
 		lagg_input_ethernet_p = lagg_input_ethernet;
 		lagg_input_infiniband_p = lagg_input_infiniband;
 		lagg_linkstate_p = lagg_port_state;
 		lagg_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, lagg_port_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 		return (0);
 	}
 	if (type == MOD_UNLOAD) {
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    lagg_detach_cookie);
 		lagg_input_ethernet_p = NULL;
 		lagg_input_infiniband_p = NULL;
 		lagg_linkstate_p = NULL;
 		return (0);
 	}
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler (lagg_modevent), no extra data. */
 static moduledata_t lagg_mod = {
 	"if_lagg",
 	lagg_modevent,
 	0
 };
 
 /* Register the module, its version, and its dependency on if_infiniband. */
 DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_lagg, 1);
 MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1);
 
 /*
  * Attach aggregation protocol "pr" to a lagg that currently has none.
  * Runs the protocol's pr_attach callback (if any) before recording the
  * protocol in sc_proto.  The softc's sx lock must be held exclusively.
  */
 static void
 lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
 {
 	const struct lagg_proto *ops;
 
 	LAGG_XLOCK_ASSERT(sc);
 	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
 	    __func__, sc));
 
 	if ((sc->sc_ifflags & IFF_DEBUG) != 0)
 		if_printf(sc->sc_ifp, "using proto %u\n", pr);
 
 	ops = &lagg_protos[pr];
 	if (ops->pr_attach != NULL)
 		ops->pr_attach(sc);
 	sc->sc_proto = pr;
 }
 
 static void
 lagg_proto_detach(struct lagg_softc *sc)
 {
 	lagg_proto pr;
 
 	LAGG_XLOCK_ASSERT(sc);
 	pr = sc->sc_proto;
 	sc->sc_proto = LAGG_PROTO_NONE;
 
 	if (lagg_protos[pr].pr_detach != NULL)
 		lagg_protos[pr].pr_detach(sc);
 }
 
 static int
 lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
 }
 
 static struct mbuf *
 lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
 }
 
 static int
 lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
 		return (0);
 	else
 		return (lagg_protos[sc->sc_proto].pr_addport(lp));
 }
 
 static void
 lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
 		lagg_protos[sc->sc_proto].pr_delport(lp);
 }
 
 static void
 lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
 		lagg_protos[sc->sc_proto].pr_linkstate(lp);
 }
 
 static void
 lagg_proto_init(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_init != NULL)
 		lagg_protos[sc->sc_proto].pr_init(sc);
 }
 
 static void
 lagg_proto_stop(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
 		lagg_protos[sc->sc_proto].pr_stop(sc);
 }
 
 static void
 lagg_proto_lladdr(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
 		lagg_protos[sc->sc_proto].pr_lladdr(sc);
 }
 
 static void
 lagg_proto_request(struct lagg_softc *sc, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_request != NULL)
 		lagg_protos[sc->sc_proto].pr_request(sc, v);
 }
 
 static void
 lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
 		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
 }
 
 /*
  * vlan_config event handler.
  *
  * When the event is for this lagg (ifp's softc equals "arg"), re-invoke
  * the vlan_config event on every member port with the same tag.
  */
 static void
 lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *port;
 
 	sc = ifp->if_softc;
 	if (sc != arg)		/* Not our event */
 		return;
 
 	LAGG_XLOCK(sc);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		EVENTHANDLER_INVOKE(vlan_config, port->lp_ifp, vtag);
 	}
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * vlan_unconfig event handler.
  *
  * When the event is for this lagg (ifp's softc equals "arg"), re-invoke
  * the vlan_unconfig event on every member port with the same tag.
  */
 static void
 lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *port;
 
 	sc = ifp->if_softc;
 	if (sc != arg)		/* Not our event */
 		return;
 
 	LAGG_XLOCK(sc);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		EVENTHANDLER_INVOKE(vlan_unconfig, port->lp_ifp, vtag);
 	}
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * Cloner create callback (installed as create_f in vnet_lagg_init()).
  *
  * Allocates and initialises a lagg softc and its ifnet, attaches the
  * default aggregation protocol, attaches the ifnet as an Ethernet or
  * InfiniBand interface, registers the vlan event handlers and links the
  * softc into the per-vnet lagg list.  The new ifnet is returned via ifpp.
  *
  * The interface type comes from the optional creation parameters
  * (iflp.lagg_type); without parameters an Ethernet lagg is created.
  *
  * Returns 0 on success, the ifc_copyin() error, EINVAL for an unknown
  * lagg_type, or ENOSPC if if_alloc() returns NULL.
  */
 static int
 lagg_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct iflaggparam iflp;
 	struct lagg_softc *sc;
 	struct ifnet *ifp;
 	int if_type;
 	int error;
 	/* All-zero link-level address used when attaching the interface. */
 	static const uint8_t eaddr[LAGG_ADDR_LEN];
 
 	if (ifd->params != NULL) {
 		error = ifc_copyin(ifd, &iflp, sizeof(iflp));
 		if (error)
 			return (error);
 
 		switch (iflp.lagg_type) {
 		case LAGG_TYPE_ETHERNET:
 			if_type = IFT_ETHER;
 			break;
 		case LAGG_TYPE_INFINIBAND:
 			if_type = IFT_INFINIBAND;
 			break;
 		default:
 			return (EINVAL);
 		}
 	} else {
 		if_type = IFT_ETHER;
 	}
 
 	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(if_type);
 	if (ifp == NULL) {
 		free(sc, M_LAGG);
 		return (ENOSPC);
 	}
 	LAGG_SX_INIT(sc);
 
 	mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF);
 	callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0);
 
 	/* The softc lock is held until the lagg is on the global list. */
 	LAGG_XLOCK(sc);
 	/* Seed the options from the net.link.lagg.default_* sysctls. */
 	if (V_def_use_flowid)
 		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
 	if (V_def_use_numa)
 		sc->sc_opts |= LAGG_OPT_USE_NUMA;
 	sc->flowid_shift = V_def_flowid_shift;
 
 	/* Hash all layers by default */
 	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
 
 	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
 
 	CK_SLIST_INIT(&sc->sc_ports);
 
 	switch (if_type) {
 	case IFT_ETHER:
 		/* Initialise pseudo media types */
 		ifmedia_init(&sc->sc_media, 0, lagg_media_change,
 		    lagg_media_status);
 		ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 		ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
 
 		if_initname(ifp, laggname, ifd->unit);
 		ifp->if_transmit = lagg_transmit_ethernet;
 		break;
 	case IFT_INFINIBAND:
 		if_initname(ifp, laggname, ifd->unit);
 		ifp->if_transmit = lagg_transmit_infiniband;
 		break;
 	default:
 		break;
 	}
 	/* Install the type-independent ifnet methods. */
 	ifp->if_softc = sc;
 	ifp->if_qflush = lagg_qflush;
 	ifp->if_init = lagg_init;
 	ifp->if_ioctl = lagg_ioctl;
 	ifp->if_get_counter = lagg_get_counter;
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
 	ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
 	/*
 	 * Attach as an ordinary ethernet device, children will be attached
 	 * as special device IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG.
 	 */
 	switch (if_type) {
 	case IFT_ETHER:
 		ether_ifattach(ifp, eaddr);
 		break;
 	case IFT_INFINIBAND:
 		infiniband_ifattach(ifp, eaddr, sc->sc_bcast_addr);
 		break;
 	default:
 		break;
 	}
 
 	/* The handlers receive sc as "arg" and use it to filter events. */
 	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
 
 	/* Insert into the global list of laggs */
 	LAGG_LIST_LOCK();
 	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
 	LAGG_LIST_UNLOCK();
 	LAGG_XUNLOCK(sc);
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Cloner destroy callback (installed as destroy_f in vnet_lagg_init()).
  *
  * Under the softc lock: marks the lagg as being destroyed, stops it,
  * deregisters the vlan event handlers, removes every port and detaches
  * the aggregation protocol.  Afterwards the ifnet is detached and freed,
  * the softc is unlinked from the per-vnet list and its locks and memory
  * are released.  Always returns 0.
  */
 static int
 lagg_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	/* lagg_port_create() refuses new ports once this is set. */
 	sc->sc_destroying = 1;
 	lagg_stop(sc);
 	ifp->if_flags &= ~IFF_UP;
 
 	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 
 	/* Shutdown and remove lagg ports */
 	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
 		lagg_port_destroy(lp, 1);
 
 	/* Unhook the aggregation protocol */
 	lagg_proto_detach(sc);
 	LAGG_XUNLOCK(sc);
 
 	/* Undo the type-specific attach done in lagg_clone_create(). */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 		ifmedia_removeall(&sc->sc_media);
 		ether_ifdetach(ifp);
 		break;
 	case IFT_INFINIBAND:
 		infiniband_ifdetach(ifp);
 		break;
 	default:
 		break;
 	}
 	if_free(ifp);
 
 	LAGG_LIST_LOCK();
 	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
 	LAGG_LIST_UNLOCK();
 
 	mtx_destroy(&sc->sc_mtx);
 	LAGG_SX_DESTROY(sc);
 	free(sc, M_LAGG);
 
 	return (0);
 }
 
 /*
  * Recompute the lagg interface's capabilities from its member ports.
  *
  * The enabled capabilities (if_capenable/if_capenable2), the supported
  * capabilities (if_capabilities/if_capabilities2) and if_hwassist of the
  * lagg become the bitwise AND over all ports, or 0 when there are no
  * ports.  The common enabled set is also pushed back to every port with
  * lagg_setcaps().  TSO limits are combined with if_hw_tsomax_common().
  *
  * The lagg ifnet is only written (and if_lastchange updated) when one of
  * the computed values differs from what is currently set.
  *
  * The softc's sx lock must be held exclusively.
  */
 static void
 lagg_capabilities(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int cap, cap2, ena, ena2, pena, pena2;
 	uint64_t hwa;
 	struct ifnet_hw_tsomax hw_tsomax;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* Get common enabled capabilities for the lagg ports */
 	ena = ena2 = ~0;
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		ena &= lp->lp_ifp->if_capenable;
 		ena2 &= lp->lp_ifp->if_capenable2;
 	}
 	if (CK_SLIST_FIRST(&sc->sc_ports) == NULL)
 		ena = ena2 = 0;
 
 	/*
 	 * Apply common enabled capabilities back to the lagg ports.
 	 * May require several iterations if they are dependent.
 	 */
 	do {
 		pena = ena;
 		pena2 = ena2;
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setcaps(lp, ena, ena2);
 			ena &= lp->lp_ifp->if_capenable;
 			ena2 &= lp->lp_ifp->if_capenable2;
 		}
 	} while (pena != ena || pena2 != ena2);
 
 	/* Get other capabilities from the lagg ports */
 	cap = cap2 = ~0;
 	hwa = ~(uint64_t)0;
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		cap &= lp->lp_ifp->if_capabilities;
 		cap2 &= lp->lp_ifp->if_capabilities2;
 		hwa &= lp->lp_ifp->if_hwassist;
 		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
 	}
 	if (CK_SLIST_FIRST(&sc->sc_ports) == NULL)
 		cap = cap2 = hwa = 0;
 
 	/*
 	 * if_capabilities2 must take part in the comparison as well;
 	 * otherwise a change confined to it would never be propagated.
 	 */
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capabilities2 != cap2 ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_capenable2 != ena2 ||
 	    sc->sc_ifp->if_hwassist != hwa ||
 	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capabilities2 = cap2;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_capenable2 = ena2;
 		sc->sc_ifp->if_hwassist = hwa;
 		getmicrotime(&sc->sc_ifp->if_lastchange);
 
 		if (sc->sc_ifflags & IFF_DEBUG)
 			if_printf(sc->sc_ifp,
 			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
 	}
 }
 
 static int
 lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp, *tlp;
 	struct ifreq ifr;
 	int error, i, oldmtu;
 	int if_type;
 	uint64_t *pval;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (sc->sc_ifp == ifp) {
 		if_printf(sc->sc_ifp,
 		    "cannot add a lagg to itself as a port\n");
 		return (EINVAL);
 	}
 
 	if (sc->sc_destroying == 1)
 		return (ENXIO);
 
 	/* Limit the maximal number of lagg ports */
 	if (sc->sc_count >= LAGG_MAX_PORTS)
 		return (ENOSPC);
 
 	/* Check if port has already been associated to a lagg */
 	if (ifp->if_lagg != NULL) {
 		/* Port is already in the current lagg? */
 		lp = (struct lagg_port *)ifp->if_lagg;
 		if (lp->lp_softc == sc)
 			return (EEXIST);
 		return (EBUSY);
 	}
 
 	switch (sc->sc_ifp->if_type) {
 	case IFT_ETHER:
 		/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
 		if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
 			return (EPROTONOSUPPORT);
 		if_type = IFT_IEEE8023ADLAG;
 		break;
 	case IFT_INFINIBAND:
 		/* XXX Disallow non-infiniband interfaces */
 		if (ifp->if_type != IFT_INFINIBAND)
 			return (EPROTONOSUPPORT);
 		if_type = IFT_INFINIBANDLAG;
 		break;
 	default:
 		break;
 	}
 
 	/* Allow the first Ethernet member to define the MTU */
 	oldmtu = -1;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		sc->sc_ifp->if_mtu = ifp->if_mtu;
 	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
 		if (ifp->if_ioctl == NULL) {
 			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
 			    ifp->if_xname);
 			return (EINVAL);
 		}
 		oldmtu = ifp->if_mtu;
 		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
 		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		if (error != 0) {
 			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
 			    ifp->if_xname);
 			return (error);
 		}
 		ifr.ifr_mtu = oldmtu;
 	}
 
 	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
 	lp->lp_softc = sc;
 
 	/* Check if port is a stacked lagg */
 	LAGG_LIST_LOCK();
 	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
 		if (ifp == sc_ptr->sc_ifp) {
 			LAGG_LIST_UNLOCK();
 			free(lp, M_LAGG);
 			if (oldmtu != -1)
 				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 				    (caddr_t)&ifr);
 			return (EINVAL);
 			/* XXX disable stacking for the moment, its untested */
 #ifdef LAGG_PORT_STACKING
 			lp->lp_flags |= LAGG_PORT_STACK;
 			if (lagg_port_checkstacking(sc_ptr) >=
 			    LAGG_MAX_STACKING) {
 				LAGG_LIST_UNLOCK();
 				free(lp, M_LAGG);
 				if (oldmtu != -1)
 					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 					    (caddr_t)&ifr);
 				return (E2BIG);
 			}
 #endif
 		}
 	}
 	LAGG_LIST_UNLOCK();
 
 	if_ref(ifp);
 	lp->lp_ifp = ifp;
 
 	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen);
 	lp->lp_ifcapenable = ifp->if_capenable;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
 		lagg_proto_lladdr(sc);
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	} else {
 		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
 	}
 	lagg_setflags(lp, 1);
 
 	if (CK_SLIST_EMPTY(&sc->sc_ports))
 		sc->sc_primary = lp;
 
 	/* Change the interface type */
 	lp->lp_iftype = ifp->if_type;
 	ifp->if_type = if_type;
 	ifp->if_lagg = lp;
 	lp->lp_ioctl = ifp->if_ioctl;
 	ifp->if_ioctl = lagg_port_ioctl;
 	lp->lp_output = ifp->if_output;
 	ifp->if_output = lagg_port_output;
 
 	/* Read port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++)
 		*pval = ifp->if_get_counter(ifp, i);
 
 	/*
 	 * Insert into the list of ports.
 	 * Keep ports sorted by if_index. It is handy, when configuration
 	 * is predictable and `ifconfig laggN create ...` command
 	 * will lead to the same result each time.
 	 */
 	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
 		if (tlp->lp_ifp->if_index < ifp->if_index && (
 		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
 		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
 		    ifp->if_index))
 			break;
 	}
 	if (tlp != NULL)
 		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
 	else
 		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
 	sc->sc_count++;
 
 	lagg_setmulti(lp);
 
 	if ((error = lagg_proto_addport(sc, lp)) != 0) {
 		/* Remove the port, without calling pr_delport. */
 		lagg_port_destroy(lp, 0);
 		if (oldmtu != -1)
 			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		return (error);
 	}
 
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 #ifdef LAGG_PORT_STACKING
 /*
  * Return the stacking depth of lagg "sc": 1 plus the largest depth of
  * any port flagged LAGG_PORT_STACK (whose ifnet softc is itself a lagg).
  *
  * The recursive result is stored in a local before comparing: passing
  * the call directly to MAX() would evaluate it twice per port and make
  * the recursion exponential in the stacking depth.
  */
 static int
 lagg_port_checkstacking(struct lagg_softc *sc)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp;
 	int depth, m = 0;
 
 	LAGG_SXLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_flags & LAGG_PORT_STACK) {
 			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
 			depth = lagg_port_checkstacking(sc_ptr);
 			if (depth > m)
 				m = depth;
 		}
 	}
 
 	return (m + 1);
 }
 #endif
 
 /*
  * Epoch callback scheduled by lagg_port_destroy(): drop the port's
  * reference on its ifnet and free the lagg_port that embeds "ec".
  */
 static void
 lagg_port_destroy_cb(epoch_context_t ec)
 {
 	struct lagg_port *port;
 
 	port = __containerof(ec, struct lagg_port, lp_epoch_ctx);
 	if_rele(port->lp_ifp);
 	free(port, M_LAGG);
 }
 
 /*
  * Remove port "lp" from its lagg and restore the underlying interface.
  *
  * When "rundelport" is non-zero the protocol's delport callback is run
  * first.  The port's saved type, ioctl and output methods are restored,
  * its counter deltas are folded into detached_counters, and it is
  * unlinked from sc_ports.  If it was the primary port, the next port (if
  * any) becomes primary and, unless the lagg is being destroyed, the lagg
  * and all remaining ports switch to the new primary's address.  Flags,
  * capabilities and link-level address of the port are restored only when
  * the port is not detaching.  The lagg_port itself is freed from an
  * epoch callback.
  *
  * The softc's sx lock must be held exclusively.  Always returns 0.
  */
 static int
 lagg_port_destroy(struct lagg_port *lp, int rundelport)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct lagg_port *lp_ptr, *lp0;
 	struct ifnet *ifp = lp->lp_ifp;
 	uint64_t *pval, vdiff;
 	int i;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (rundelport)
 		lagg_proto_delport(sc, lp);
 
 	if (lp->lp_detaching == 0)
 		lagg_clrmulti(lp);
 
 	/* Restore interface */
 	ifp->if_type = lp->lp_iftype;
 	ifp->if_ioctl = lp->lp_ioctl;
 	ifp->if_output = lp->lp_output;
 	ifp->if_lagg = NULL;
 
 	/* Update detached port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++) {
 		vdiff = ifp->if_get_counter(ifp, i) - *pval;
 		sc->detached_counters.val[i] += vdiff;
 	}
 
 	/* Finally, remove the port from the lagg */
 	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
 	sc->sc_count--;
 
 	/* Update the primary interface */
 	if (lp == sc->sc_primary) {
 		uint8_t lladdr[LAGG_ADDR_LEN];
 
 		/* New address: the next port's saved one, or all zeroes. */
 		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
 			bzero(&lladdr, LAGG_ADDR_LEN);
 		else
 			bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN);
 		sc->sc_primary = lp0;
 		if (sc->sc_destroying == 0) {
 			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen);
 			lagg_proto_lladdr(sc);
 			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 
 			/*
 			 * Update lladdr for each port (new primary needs update
 			 * as well, to switch from old lladdr to its 'real' one).
 			 * We can skip this if the lagg is being destroyed.
 			 */
 			CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
 				if_setlladdr(lp_ptr->lp_ifp, lladdr,
 				    lp_ptr->lp_ifp->if_addrlen);
 		}
 	}
 
 	if (lp->lp_ifflags)
 		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
 
 	/* Only touch the port's own state if it is not going away. */
 	if (lp->lp_detaching == 0) {
 		lagg_setflags(lp, 0);
 		lagg_setcaps(lp, lp->lp_ifcapenable, lp->lp_ifcapenable2);
 		if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen);
 	}
 
 	/*
 	 * Free the port and release its ifnet reference after a grace
 	 * period has elapsed.
 	 */
 	NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx);
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 /*
  * ioctl handler installed on member ports by lagg_port_create().
  *
  * Handles SIOCGLAGGPORT (report port state), SIOCSIFCAP/SIOCSIFCAPNV
  * (forward to the port's original ioctl, then recompute the lagg's
  * capabilities) and SIOCSIFMTU (always rejected with EINVAL).  Every
  * other command, and any call on an interface that is not a lagg port,
  * is passed to the port's saved ioctl routine; EINVAL is returned when
  * there is none.
  */
 static int
 lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epoch_tracker et;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data;
 	struct lagg_softc *sc;
 	struct lagg_port *lp = NULL;
 	int error = 0;
 
 	/* Should be checked by the caller */
 	switch (ifp->if_type) {
 	case IFT_IEEE8023ADLAG:
 	case IFT_INFINIBANDLAG:
 		if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
 			goto fallback;
 		break;
 	default:
 		goto fallback;
 	}
 
 	switch (cmd) {
 	case SIOCGLAGGPORT:
 		/* The request must name this very interface. */
 		if (rp->rp_portname[0] == '\0' ||
 		    ifunit(rp->rp_portname) != ifp) {
 			error = EINVAL;
 			break;
 		}
 
 		NET_EPOCH_ENTER(et);
 		/* Re-read the port inside the epoch section. */
 		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
 			error = ENOENT;
 			NET_EPOCH_EXIT(et);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		NET_EPOCH_EXIT(et);
 		break;
 
 	case SIOCSIFCAP:
 	case SIOCSIFCAPNV:
 		if (lp->lp_ioctl == NULL) {
 			error = EINVAL;
 			break;
 		}
 		error = (*lp->lp_ioctl)(ifp, cmd, data);
 		if (error)
 			break;
 
 		/* Update lagg interface capabilities */
 		LAGG_XLOCK(sc);
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(sc->sc_ifp);
 		break;
 
 	case SIOCSIFMTU:
 		/* Do not allow the MTU to be changed once joined */
 		error = EINVAL;
 		break;
 
 	default:
 		goto fallback;
 	}
 
 	return (error);
 
 fallback:
 	/* Defer to the port's original ioctl routine, if one was saved. */
 	if (lp != NULL && lp->lp_ioctl != NULL)
 		return ((*lp->lp_ioctl)(ifp, cmd, data));
 
 	return (EINVAL);
 }
 
 /*
  * Return the value of interface counter @cnt for the lagg interface.
  *
  * The result is the sum of:
  * 1) for every current port, the port's present counter value minus the
  *    value stored in port_counters when the port was attached;
  * 2) the lagg logical interface's own counter, as reported by
  *    if_get_counter_default();
  * 3) the detached_counters entry, which accumulates the same per-port
  *    difference for ports at the time they are removed.
  */
 static uint64_t
 lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *port;
 	uint64_t total = 0;
 
 	/* Revise this when we've got non-generic counters. */
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	NET_EPOCH_ENTER(et);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		/* Value saved at attach time vs. the port's current value. */
 		uint64_t base = port->port_counters.val[cnt];
 		struct ifnet *pifp = port->lp_ifp;
 
 		total += pifp->if_get_counter(pifp, cnt) - base;
 	}
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * Add counter data which might be added by upper
 	 * layer protocols operating on logical interface.
 	 */
 	total += if_get_counter_default(ifp, cnt);
 
 	/* Add counter data from detached ports. */
 	total += sc->detached_counters.val[cnt];
 
 	return (total);
 }
 
 /*
  * if_output replacement installed on member ports.
  *
  * Only frames addressed with pseudo_AF_HDRCMPLT or AF_UNSPEC are passed
  * to the port's original output routine, and only while the interface
  * still has a lagg_port attached.  Everything else is freed and
  * ENETDOWN is returned.
  */
 static int
 lagg_port_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	struct lagg_port *lp = ifp->if_lagg;
 	int direct;
 
 	direct = (dst->sa_family == pseudo_AF_HDRCMPLT ||
 	    dst->sa_family == AF_UNSPEC);
 	if (direct && lp != NULL)
 		return ((*lp->lp_output)(ifp, m, dst, ro));
 
 	/* drop any other frames */
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Interface-departure handler: if @ifp is a lagg port, mark the port as
  * detaching and destroy it under the lagg exclusive lock, then refresh the
  * VLAN capabilities of the parent lagg interface.
  */
 static void
 lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct lagg_port *port = ifp->if_lagg;
 	struct lagg_softc *softc;
 
 	/*
 	 * Nothing to do when the interface is not a lagg port, or when the
 	 * ifnet is merely being renamed.
 	 */
 	if (port == NULL || (ifp->if_flags & IFF_RENAMING) != 0)
 		return;
 
 	softc = port->lp_softc;
 
 	LAGG_XLOCK(softc);
 	port->lp_detaching = 1;
 	lagg_port_destroy(port, 1);
 	LAGG_XUNLOCK(softc);
 	VLAN_CAPABILITIES(softc->sc_ifp);
 }
 
 /*
  * Fill the port request @rp from lagg port @lp: interface and port names,
  * priority, flags, the protocol specific part (via lagg_proto_portreq())
  * and finally the flags that depend on the lagg protocol in use.
  */
 static void
 lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
 {
 	struct lagg_softc *softc = lp->lp_softc;
 
 	strlcpy(rp->rp_ifname, softc->sc_ifname, sizeof(rp->rp_ifname));
 	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname,
 	    sizeof(rp->rp_portname));
 	rp->rp_prio = lp->lp_prio;
 	rp->rp_flags = lp->lp_flags;
 	lagg_proto_portreq(softc, lp, &rp->rp_psc);
 
 	/* Protocol specific flags. */
 	switch (softc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
 		if (softc->sc_primary == lp)
 			rp->rp_flags |= LAGG_PORT_MASTER;
 		if (lagg_link_active(softc, softc->sc_primary) == lp)
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		break;
 
 	case LAGG_PROTO_ROUNDROBIN:
 	case LAGG_PROTO_LOADBALANCE:
 	case LAGG_PROTO_BROADCAST:
 		if (LAGG_PORTACTIVE(lp))
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		break;
 
 	case LAGG_PROTO_LACP:
 		/* LACP has its own definition of active. */
 		if (lacp_isactive(lp))
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		if (lacp_iscollecting(lp))
 			rp->rp_flags |= LAGG_PORT_COLLECTING;
 		if (lacp_isdistributing(lp))
 			rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
 		break;
 	}
 }
 
 static void
 lagg_watchdog_infiniband(void *arg)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct ifnet *ifp;
 	struct ifnet *lp_ifp;
 
 	sc = arg;
 
 	/*
 	 * Because infiniband nodes have a fixed MAC address, which is
 	 * generated by the so-called GID, we need to regularly update
 	 * the link level address of the parent lagg<N> device when
 	 * the active port changes. Possibly we could piggy-back on
 	 * link up/down events aswell, but using a timer also provides
 	 * a guarantee against too frequent events. This operation
 	 * does not have to be atomic.
 	 */
 	NET_EPOCH_ENTER(et);
 	lp = lagg_link_active(sc, sc->sc_primary);
 	if (lp != NULL) {
 		ifp = sc->sc_ifp;
 		lp_ifp = lp->lp_ifp;
 
 		if (ifp != NULL && lp_ifp != NULL &&
 		    (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0 ||
 		     memcmp(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen) != 0)) {
 			memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen);
 			memcpy(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen);
 
 			CURVNET_SET(ifp->if_vnet);
 			EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 			CURVNET_RESTORE();
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg);
 }
 
 /*
  * Bring the lagg interface up.  @xsc is the lagg softc.
  *
  * Does nothing when the interface is already running.  Otherwise marks it
  * running, pushes the lagg link level address to every port whose address
  * differs, initializes the protocol and, for infiniband laggs, kicks off
  * the address watchdog with sc_mtx held.  The whole sequence runs under the
  * lagg exclusive lock.
  */
 static void
 lagg_init(void *xsc)
 {
 	struct lagg_softc *sc = xsc;
 	struct ifnet *laggifp = sc->sc_ifp;
 	struct lagg_port *port;
 
 	LAGG_XLOCK(sc);
 	if ((laggifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		LAGG_XUNLOCK(sc);
 		return;
 	}
 
 	laggifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/*
 	 * Update the port lladdrs if needed.
 	 * This might be if_setlladdr() notification
 	 * that lladdr has been changed.
 	 */
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		if (memcmp(IF_LLADDR(laggifp), IF_LLADDR(port->lp_ifp),
 		    laggifp->if_addrlen) == 0)
 			continue;
 		if_setlladdr(port->lp_ifp, IF_LLADDR(laggifp),
 		    laggifp->if_addrlen);
 	}
 
 	lagg_proto_init(sc);
 
 	if (laggifp->if_type == IFT_INFINIBAND) {
 		mtx_lock(&sc->sc_mtx);
 		lagg_watchdog_infiniband(sc);
 		mtx_unlock(&sc->sc_mtx);
 	}
 
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * Stop a running lagg interface: clear IFF_DRV_RUNNING, stop the protocol,
  * then stop the watchdog callout under sc_mtx and drain it.  The caller
  * must hold the lagg exclusive lock.
  */
 static void
 lagg_stop(struct lagg_softc *sc)
 {
 	struct ifnet *laggifp = sc->sc_ifp;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if ((laggifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		laggifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 		lagg_proto_stop(sc);
 
 		mtx_lock(&sc->sc_mtx);
 		callout_stop(&sc->sc_watchdog);
 		mtx_unlock(&sc->sc_mtx);
 
 		callout_drain(&sc->sc_watchdog);
 	}
 }
 
 /*
  * ioctl handler of the lagg logical interface.
  *
  * @ifp:  the lagg interface
  * @cmd:  ioctl command
  * @data: command argument; interpreted as lagg_reqall, lagg_reqopts,
  *        lagg_reqport, lagg_reqflags or ifreq depending on @cmd
  *
  * Handles the lagg specific requests (SIOC[GS]LAGG, SIOC[GS]LAGGOPTS,
  * SIOCGLAGGFLAGS, SIOCSLAGGHASH, SIOC[GS]LAGGPORT, SIOCSLAGGDELPORT) and
  * propagates generic interface requests (flags, multicast, media,
  * capabilities, MTU) to the member ports.  Every modifying lagg request
  * requires PRIV_NET_LAGG.  Anything else is passed to ether_ioctl().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_reqall *ra = (struct lagg_reqall *)data;
 	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
 	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct lagg_port *lp;
 	struct ifnet *tpif;
 	struct thread *td = curthread;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0, oldmtu;
 
 	bzero(&rpbuf, sizeof(rpbuf));
 
 	/* XXX: This can race with lagg_clone_destroy. */
 
 	switch (cmd) {
 	case SIOCGLAGG:
 		/*
 		 * Snapshot the port list into a kernel buffer under the
 		 * lock, then copy it out after the lock has been dropped.
 		 */
 		LAGG_XLOCK(sc);
 		buflen = sc->sc_count * sizeof(struct lagg_reqport);
 		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 		ra->ra_proto = sc->sc_proto;
 		lagg_proto_request(sc, &ra->ra_psc);
 		count = 0;
 		buf = outbuf;
 		len = min(ra->ra_size, buflen);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (len < sizeof(rpbuf))
 				break;
 
 			lagg_port2req(lp, &rpbuf);
 			memcpy(buf, &rpbuf, sizeof(rpbuf));
 			count++;
 			buf += sizeof(rpbuf);
 			len -= sizeof(rpbuf);
 		}
 		LAGG_XUNLOCK(sc);
 		ra->ra_ports = count;
 		ra->ra_size = count * sizeof(rpbuf);
 		error = copyout(outbuf, ra->ra_port, ra->ra_size);
 		free(outbuf, M_TEMP);
 		break;
 	case SIOCSLAGG:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ra->ra_proto >= LAGG_PROTO_MAX) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 		/* Infiniband only supports the failover protocol. */
 		if (ra->ra_proto != LAGG_PROTO_FAILOVER &&
 		    ifp->if_type == IFT_INFINIBAND) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		lagg_proto_detach(sc);
 		lagg_proto_attach(sc, ra->ra_proto);
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGOPTS:
 		LAGG_XLOCK(sc);
 		ro->ro_opts = sc->sc_opts;
 		if (sc->sc_proto == LAGG_PROTO_LACP) {
 			struct lacp_softc *lsc;
 
 			/* LACP options live in the protocol softc. */
 			lsc = (struct lacp_softc *)sc->sc_psc;
 			if (lsc->lsc_debug.lsc_tx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
 			if (lsc->lsc_debug.lsc_rx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
 			if (lsc->lsc_strict_mode != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
 			if (lsc->lsc_fast_timeout != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO;
 
 			ro->ro_active = sc->sc_active;
 		} else {
 			ro->ro_active = 0;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				ro->ro_active += LAGG_PORTACTIVE(lp);
 		}
 		ro->ro_bkt = sc->sc_stride;
 		ro->ro_flapping = sc->sc_flapping;
 		ro->ro_flowid_shift = sc->flowid_shift;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGOPTS:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 
 		/*
 		 * The stride option was added without defining a corresponding
 		 * LAGG_OPT flag, so handle a non-zero value before checking
 		 * anything else to preserve compatibility.
 		 */
 		LAGG_XLOCK(sc);
 		if (ro->ro_opts == 0 && ro->ro_bkt != 0) {
 			if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) {
 				LAGG_XUNLOCK(sc);
 				error = EINVAL;
 				break;
 			}
 			sc->sc_stride = ro->ro_bkt;
 		}
 		if (ro->ro_opts == 0) {
 			LAGG_XUNLOCK(sc);
 			break;
 		}
 
 		/*
 		 * Set options.  LACP options are stored in sc->sc_psc,
 		 * not in sc_opts.  A positive value sets the option, the
 		 * negated value clears it.
 		 */
 		int valid, lacp;
 
 		switch (ro->ro_opts) {
 		case LAGG_OPT_USE_FLOWID:
 		case -LAGG_OPT_USE_FLOWID:
 		case LAGG_OPT_USE_NUMA:
 		case -LAGG_OPT_USE_NUMA:
 		case LAGG_OPT_FLOWIDSHIFT:
 		case LAGG_OPT_RR_LIMIT:
 			valid = 1;
 			lacp = 0;
 			break;
 		case LAGG_OPT_LACP_TXTEST:
 		case -LAGG_OPT_LACP_TXTEST:
 		case LAGG_OPT_LACP_RXTEST:
 		case -LAGG_OPT_LACP_RXTEST:
 		case LAGG_OPT_LACP_STRICT:
 		case -LAGG_OPT_LACP_STRICT:
 		case LAGG_OPT_LACP_FAST_TIMO:
 		case -LAGG_OPT_LACP_FAST_TIMO:
 			valid = lacp = 1;
 			break;
 		default:
 			valid = lacp = 0;
 			break;
 		}
 
 		if (valid == 0 ||
 		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
 			/* Invalid combination of options specified. */
 			error = EINVAL;
 			LAGG_XUNLOCK(sc);
 			break;	/* Return from SIOCSLAGGOPTS. */
 		}
 
 		/*
 		 * Store new options into sc->sc_opts except for
 		 * FLOWIDSHIFT, RR and LACP options.
 		 */
 		if (lacp == 0) {
 			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
 				sc->flowid_shift = ro->ro_flowid_shift;
 			else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) {
 				if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN ||
 				    ro->ro_bkt == 0) {
 					error = EINVAL;
 					LAGG_XUNLOCK(sc);
 					break;
 				}
 				sc->sc_stride = ro->ro_bkt;
 			} else if (ro->ro_opts > 0)
 				sc->sc_opts |= ro->ro_opts;
 			else {
 				/*
 				 * ro_opts holds the negated flag here.  Negate
 				 * it back before building the mask:
 				 * ~(-flag) == flag - 1, which would clear the
 				 * flag and every higher bit, not just the flag.
 				 */
 				sc->sc_opts &= ~(-ro->ro_opts);
 			}
 		} else {
 			struct lacp_softc *lsc;
 			struct lacp_port *lp;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 
 			switch (ro->ro_opts) {
 			case LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 0;
 				break;
 			case LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 0;
 				break;
 			case LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 1;
 				break;
 			case -LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 0;
 				break;
 			case LAGG_OPT_LACP_FAST_TIMO:
 				/* Flip the timeout bit on every LACP port. */
 				LACP_LOCK(lsc);
 				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
 					lp->lp_state |= LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 1;
 				break;
 			case -LAGG_OPT_LACP_FAST_TIMO:
 				LACP_LOCK(lsc);
 				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
 					lp->lp_state &= ~LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 0;
 				break;
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGFLAGS:
 		/* Translate the mbuf hash flags into LAGG_F_HASH* flags. */
 		rf->rf_flags = 0;
 		LAGG_XLOCK(sc);
 		if (sc->sc_flags & MBUF_HASHFLAG_L2)
 			rf->rf_flags |= LAGG_F_HASHL2;
 		if (sc->sc_flags & MBUF_HASHFLAG_L3)
 			rf->rf_flags |= LAGG_F_HASHL3;
 		if (sc->sc_flags & MBUF_HASHFLAG_L4)
 			rf->rf_flags |= LAGG_F_HASHL4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGHASH:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		/* At least one hash layer must be requested. */
 		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
 			error = EINVAL;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		sc->sc_flags = 0;
 		if (rf->rf_flags & LAGG_F_HASHL2)
 			sc->sc_flags |= MBUF_HASHFLAG_L2;
 		if (rf->rf_flags & LAGG_F_HASHL3)
 			sc->sc_flags |= MBUF_HASHFLAG_L3;
 		if (rf->rf_flags & LAGG_F_HASHL4)
 			sc->sc_flags |= MBUF_HASHFLAG_L4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		/* The named interface must be a port of this lagg. */
 		NET_EPOCH_ENTER(et);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			NET_EPOCH_EXIT(et);
 			if_rele(tpif);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		NET_EPOCH_EXIT(et);
 		if_rele(tpif);
 		break;
 	case SIOCSLAGGPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 #ifdef INET6
 		/*
 		 * A laggport interface should not have inet6 address
 		 * because two interfaces with a valid link-local
 		 * scope zone must not be merged in any form.  This
 		 * restriction is needed to prevent violation of
 		 * link-local scope zone.  Attempts to add a laggport
 		 * interface which has inet6 addresses triggers
 		 * removal of all inet6 addresses on the member
 		 * interface.
 		 */
 		if (in6ifa_llaonifp(tpif)) {
 			in6_ifdetach(tpif);
 			if_printf(sc->sc_ifp,
 			    "IPv6 addresses on %s have been removed "
 			    "before adding it as a member to prevent "
 			    "IPv6 address scope violation.\n",
 			    tpif->if_xname);
 		}
 #endif
 		oldmtu = ifp->if_mtu;
 		LAGG_XLOCK(sc);
 		error = lagg_port_create(sc, tpif);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 
 		/*
 		 * LAGG MTU may change during addition of the first port.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSLAGGDELPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_XUNLOCK(sc);
 			if_rele(tpif);
 			break;
 		}
 
 		error = lagg_port_destroy(lp, 1);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSIFFLAGS:
 		/* Set flags on ports too */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setflags(lp, 1);
 		}
 
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			lagg_stop(sc);
 			LAGG_XUNLOCK(sc);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.  The lock is dropped first; the init
 			 * routine is called without it.
 			 */
 			LAGG_XUNLOCK(sc);
 			(*ifp->if_init)(sc);
 		} else
 			LAGG_XUNLOCK(sc);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Rebuild the multicast list of every port from scratch. */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_clrmulti(lp);
 			lagg_setmulti(lp);
 		}
 		LAGG_XUNLOCK(sc);
 		error = 0;
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		if (ifp->if_type == IFT_INFINIBAND)
 			error = EINVAL;
 		else
 			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
 		break;
 
 	case SIOCSIFCAP:
 	case SIOCSIFCAPNV:
 		/*
 		 * Forward the request to every port (results are ignored),
 		 * then recompute the capabilities of the lagg itself.
 		 */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 		}
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(ifp);
 		error = 0;
 		break;
 
 	case SIOCGIFCAPNV:
 		error = 0;
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Apply the new MTU to every port; stop at the first
 		 * failure and roll all ports back to the current lagg MTU.
 		 */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			else
 				error = EINVAL;
 			if (error != 0) {
 				if_printf(ifp,
 				    "failed to change MTU to %d on port %s, "
 				    "reverting all ports to original MTU (%d)\n",
 				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
 				break;
 			}
 		}
 		if (error == 0) {
 			ifp->if_mtu = ifr->ifr_mtu;
 		} else {
 			/* set every port back to the original MTU */
 			ifr->ifr_mtu = ifp->if_mtu;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 				if (lp->lp_ioctl != NULL)
 					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			}
 		}
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(ifp);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 	return (error);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 #ifdef RATELIMIT
 /*
  * Send tag method table for IF_SND_TAG_TYPE_UNLIMITED tags; every method
  * is one of the lagg wrapper routines.
  */
 static const struct if_snd_tag_sw lagg_snd_tag_ul_sw = {
 	.type = IF_SND_TAG_TYPE_UNLIMITED,
 	.next_snd_tag = lagg_next_snd_tag,
 	.snd_tag_free = lagg_snd_tag_free,
 	.snd_tag_query = lagg_snd_tag_query,
 	.snd_tag_modify = lagg_snd_tag_modify
 };
 
 /*
  * Send tag method table for IF_SND_TAG_TYPE_RATE_LIMIT tags; every method
  * is one of the lagg wrapper routines.
  */
 static const struct if_snd_tag_sw lagg_snd_tag_rl_sw = {
 	.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 	.next_snd_tag = lagg_next_snd_tag,
 	.snd_tag_free = lagg_snd_tag_free,
 	.snd_tag_query = lagg_snd_tag_query,
 	.snd_tag_modify = lagg_snd_tag_modify
 };
 #endif
 
 #ifdef KERN_TLS
 /*
  * Send tag method table for IF_SND_TAG_TYPE_TLS tags; every method is one
  * of the lagg wrapper routines.
  */
 static const struct if_snd_tag_sw lagg_snd_tag_tls_sw = {
 	.type = IF_SND_TAG_TYPE_TLS,
 	.next_snd_tag = lagg_next_snd_tag,
 	.snd_tag_free = lagg_snd_tag_free,
 	.snd_tag_query = lagg_snd_tag_query,
 	.snd_tag_modify = lagg_snd_tag_modify
 };
 
 #ifdef RATELIMIT
 /*
  * Send tag method table for IF_SND_TAG_TYPE_TLS_RATE_LIMIT tags; every
  * method is one of the lagg wrapper routines.
  */
 static const struct if_snd_tag_sw lagg_snd_tag_tls_rl_sw = {
 	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT,
 	.next_snd_tag = lagg_next_snd_tag,
 	.snd_tag_free = lagg_snd_tag_free,
 	.snd_tag_query = lagg_snd_tag_query,
 	.snd_tag_modify = lagg_snd_tag_modify
 };
 #endif
 #endif
 
 /*
  * Convert a pointer to the embedded "com" send tag back into a pointer to
  * the lagg_snd_tag that contains it.
  */
 static inline struct lagg_snd_tag *
 mst_to_lst(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = __containerof(mst, struct lagg_snd_tag, com);
 	return (lst);
 }
 
 /*
  * Look up the port used by a specific flow.  This only works for lagg
  * protocols with deterministic port mappings (e.g. not roundrobin).
  * In addition protocols which use a hash to map flows to ports must
  * be configured to use the mbuf flowid rather than hashing packet
  * contents.
  *
  * @ifp:         the lagg interface
  * @flowid:      flow identifier of the flow
  * @flowtype:    hash type of @flowid; M_HASHTYPE_NONE means "no flowid"
  * @numa_domain: NUMA domain hint, only passed on to LACP port selection
  *
  * Returns the selected port, or NULL when no port can be determined
  * (unsupported protocol, flowid not usable, or no port available).
  */
 static struct lagg_port *
 lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
     uint8_t numa_domain)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 	uint32_t hash, p;
 	int err;
 
 	sc = ifp->if_softc;
 
 	switch (sc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
 		return (lagg_link_active(sc, sc->sc_primary));
 	case LAGG_PROTO_LOADBALANCE:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
 		    flowtype == M_HASHTYPE_NONE)
 			return (NULL);
 		/*
 		 * With no ports attached there is nothing to select, and
 		 * the modulo below would divide by zero.
 		 */
 		if (sc->sc_count == 0)
 			return (NULL);
 		p = flowid >> sc->flowid_shift;
 		p %= sc->sc_count;
 		lb = (struct lagg_lb *)sc->sc_psc;
 		lp = lb->lb_ports[p];
 		return (lagg_link_active(sc, lp));
 	case LAGG_PROTO_LACP:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
 		    flowtype == M_HASHTYPE_NONE)
 			return (NULL);
 		hash = flowid >> sc->flowid_shift;
 		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, &err));
 	default:
 		return (NULL);
 	}
 }
 
 static int
 lagg_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct epoch_tracker et;
 	const struct if_snd_tag_sw *sw;
 	struct lagg_snd_tag *lst;
 	struct lagg_port *lp;
 	struct ifnet *lp_ifp;
 	struct m_snd_tag *mst;
 	int error;
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		sw = &lagg_snd_tag_ul_sw;
 		break;
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		sw = &lagg_snd_tag_rl_sw;
 		break;
 #endif
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		sw = &lagg_snd_tag_tls_sw;
 		break;
 	case IF_SND_TAG_TYPE_TLS_RX:
 		/* Return tag from port interface directly. */
 		sw = NULL;
 		break;
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		sw = &lagg_snd_tag_tls_rl_sw;
 		break;
 #endif
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	NET_EPOCH_ENTER(et);
 	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
 	    params->hdr.flowtype, params->hdr.numa_domain);
 	if (lp == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	if (lp->lp_ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	lp_ifp = lp->lp_ifp;
 	if_ref(lp_ifp);
 	NET_EPOCH_EXIT(et);
 
 	if (sw != NULL) {
 		lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
 		if (lst == NULL) {
 			if_rele(lp_ifp);
 			return (ENOMEM);
 		}
 	} else
 		lst = NULL;
 
 	error = m_snd_tag_alloc(lp_ifp, params, &mst);
 	if_rele(lp_ifp);
 	if (error) {
 		free(lst, M_LAGG);
 		return (error);
 	}
 
 	if (sw != NULL) {
 		m_snd_tag_init(&lst->com, ifp, sw);
 		lst->tag = mst;
 
 		*ppmt = &lst->com;
 	} else
 		*ppmt = mst;
 
 	return (0);
 }
 
 static struct m_snd_tag *
 lagg_next_snd_tag(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
 	return (lst->tag);
 }
 
 static int
 lagg_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
 	return (lst->tag->sw->snd_tag_modify(lst->tag, params));
 }
 
 static int
 lagg_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
 	return (lst->tag->sw->snd_tag_query(lst->tag, params));
 }
 
 /*
  * Destroy the lagg send tag @mst: drop the reference on the wrapped
  * port-level tag and free the wrapper.
  */
 static void
 lagg_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *wrapper = mst_to_lst(mst);
 
 	m_snd_tag_rele(wrapper->tag);
 	free(wrapper, M_LAGG);
 }
 
 /*
  * Rate limit query for a lagg interface.
  *
  * lagg is an indirect interface: the caller needs to get a ratelimit tag
  * on the actual interface the flow will go on.  The result therefore
  * carries no rate table and zero flows/rates, and is flagged
  * RT_IS_INDIRECT.
  */
 static void
 lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 
 	q->flags = RT_IS_INDIRECT;
 	q->rate_table = NULL;
 	q->number_of_rates = 0;
 	q->max_flows = 0;
 }
 #endif
 
 /*
  * Program the link layer multicast addresses of the lagg interface on
  * port @lp.
  *
  * First, with the lagg interface's address write lock held, every AF_LINK
  * multicast address of the lagg is copied into a new lagg_mc entry
  * (re-indexed to the port's if_index) and inserted on lp_mc_head.  Then,
  * with the lock released, each entry on lp_mc_head is added to the port
  * with if_addmulti().
  *
  * Returns 0 on success, ENOMEM if an entry cannot be allocated, or the
  * first error from if_addmulti().  Entries already queued are left on
  * lp_mc_head when an error is returned.
  */
 static int
 lagg_setmulti(struct lagg_port *lp)
 {
 	struct ifnet *laggifp = lp->lp_softc->sc_ifp;
 	struct ifnet *portifp = lp->lp_ifp;
 	struct ifmultiaddr *ifma;
 	struct lagg_mc *entry;
 	int error;
 
 	IF_ADDR_WLOCK(laggifp);
 	CK_STAILQ_FOREACH(ifma, &laggifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		/* The address lock is held, so the allocation must not sleep. */
 		entry = malloc(sizeof(*entry), M_LAGG, M_NOWAIT);
 		if (entry == NULL) {
 			IF_ADDR_WUNLOCK(laggifp);
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &entry->mc_addr,
 		    ifma->ifma_addr->sa_len);
 		entry->mc_addr.sdl_index = portifp->if_index;
 		entry->mc_ifma = NULL;
 		SLIST_INSERT_HEAD(&lp->lp_mc_head, entry, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(laggifp);
 
 	SLIST_FOREACH (entry, &lp->lp_mc_head, mc_entries) {
 		error = if_addmulti(portifp,
 		    (struct sockaddr *)&entry->mc_addr, &entry->mc_ifma);
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Empty the multicast list of port @lp.
  *
  * Each entry is unlinked from lp_mc_head and freed.  Its multicast
  * membership is released with if_delmulti_ifma() only when one was
  * recorded and the port is not detaching.  The lagg exclusive lock must
  * be held.  Always returns 0.
  */
 static int
 lagg_clrmulti(struct lagg_port *lp)
 {
 	struct lagg_mc *entry;
 
 	LAGG_XLOCK_ASSERT(lp->lp_softc);
 	for (;;) {
 		entry = SLIST_FIRST(&lp->lp_mc_head);
 		if (entry == NULL)
 			break;
 		SLIST_REMOVE_HEAD(&lp->lp_mc_head, mc_entries);
 		if (entry->mc_ifma != NULL && lp->lp_detaching == 0)
 			if_delmulti_ifma(entry->mc_ifma);
 		free(entry, M_LAGG);
 	}
 	return (0);
 }
 
 static void
 lagg_setcaps(struct lagg_port *lp, int cap, int cap2)
 {
 	struct ifreq ifr;
 	struct siocsifcapnv_driver_data drv_ioctl_data;
 
 	if (lp->lp_ifp->if_capenable == cap &&
 	    lp->lp_ifp->if_capenable2 == cap2)
 		return;
 	if (lp->lp_ioctl == NULL)
 		return;
 	/* XXX */
 	if ((lp->lp_ifp->if_capabilities & IFCAP_NV) != 0) {
 		drv_ioctl_data.reqcap = cap;
 		drv_ioctl_data.reqcap2 = cap2;
 		drv_ioctl_data.nvcap = NULL;
 		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAPNV,
 		    (caddr_t)&drv_ioctl_data);
 	} else {
 		ifr.ifr_reqcap = cap;
 		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr);
 	}
 }
 
 /* Handle a ref counted flag that should be set on the lagg port as well */
 static int
 lagg_setflag(struct lagg_port *lp, int flag, int status,
     int (*func)(struct ifnet *, int))
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct ifnet *ifp = lp->lp_ifp;
 	int error;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	status = status ? (scifp->if_flags & flag) : 0;
 	/* Now "status" contains the flag value or 0 */
 
 	/*
 	 * See if recorded ports status is different from what
 	 * we want it to be.  If it is, flip it.  We record ports
 	 * status in lp_ifflags so that we won't clear ports flag
 	 * we haven't set.  In fact, we don't clear or set ports
 	 * flags directly, but get or release references to them.
 	 * That's why we can be sure that recorded flags still are
 	 * in accord with actual ports flags.
 	 */
 	if (status != (lp->lp_ifflags & flag)) {
 		error = (*func)(ifp, status);
 		if (error)
 			return (error);
 		lp->lp_ifflags &= ~flag;
 		lp->lp_ifflags |= status;
 	}
 	return (0);
 }
 
 /*
  * Handle IFF_* flags that require certain changes on the lagg port
  * if "status" is true, update ports flags respective to the lagg
  * if "status" is false, forcedly clear the flags set on port.
  */
 static int
 lagg_setflags(struct lagg_port *lp, int status)
 {
 	int error, i;
 
 	for (i = 0; lagg_pflags[i].flag; i++) {
 		error = lagg_setflag(lp, lagg_pflags[i].flag,
 		    status, lagg_pflags[i].func);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Transmit routine of an ethernet lagg interface.
  *
  * Inside the net epoch the packet is tapped to BPF and handed to the
  * protocol's start routine.  Without a protocol or without any port the
  * mbuf is freed, an output error is counted and ENXIO is returned.
  */
 static int
 lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct epoch_tracker et;
 	int error;
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 #endif
 	NET_EPOCH_ENTER(et);
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto != LAGG_PROTO_NONE && sc->sc_count != 0) {
 		ETHER_BPF_MTAP(ifp, m);
 
 		error = lagg_proto_start(sc, m);
 		NET_EPOCH_EXIT(et);
 		return (error);
 	}
 
 	NET_EPOCH_EXIT(et);
 	m_freem(m);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (ENXIO);
 }
 
 /*
  * Transmit routine of an infiniband lagg interface.
  *
  * Inside the net epoch the packet is tapped to BPF and handed to the
  * protocol's start routine.  Without a protocol or without any port the
  * mbuf is freed, an output error is counted and ENXIO is returned.
  */
 static int
 lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct epoch_tracker et;
 	int error;
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 #endif
 	NET_EPOCH_ENTER(et);
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto != LAGG_PROTO_NONE && sc->sc_count != 0) {
 		INFINIBAND_BPF_MTAP(ifp, m);
 
 		error = lagg_proto_start(sc, m);
 		NET_EPOCH_EXIT(et);
 		return (error);
 	}
 
 	NET_EPOCH_EXIT(et);
 	m_freem(m);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (ENXIO);
 }
 
 /*
  * The ifp->if_qflush entry point for lagg(4) is a no-op.
  */
 static void
 lagg_qflush(struct ifnet *ifp __unused)
 {
 
 	/* Intentionally empty. */
 }
 
 /*
  * Input hook for ethernet frames received on lagg port @ifp.
  *
  * The frame is dropped (NULL returned) when the lagg is not running, the
  * port is detaching or no protocol is configured.  Otherwise it is tapped
  * to BPF on the lagg interface and passed to the protocol's input routine;
  * whatever that returns is dropped as well if the lagg is in monitor mode.
  * With DEV_NETMAP and IFCAP_NETMAP enabled on the lagg, a surviving frame
  * is delivered through the lagg's if_input and NULL is returned.
  *
  * Returns the mbuf to continue processing with, or NULL if it was
  * consumed.
  */
 static struct mbuf *
 lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_port *port = ifp->if_lagg;
 	struct lagg_softc *softc = port->lp_softc;
 	struct ifnet *laggifp = softc->sc_ifp;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	if ((laggifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    port->lp_detaching != 0 ||
 	    softc->sc_proto == LAGG_PROTO_NONE) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (NULL);
 	}
 
 	ETHER_BPF_MTAP(laggifp, m);
 
 	m = lagg_proto_input(softc, port, m);
 	/* In monitor mode nothing is passed up the stack. */
 	if (m != NULL && (laggifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 #ifdef DEV_NETMAP
 	if (m != NULL && (laggifp->if_capenable & IFCAP_NETMAP)) {
 		laggifp->if_input(laggifp, m);
 		m = NULL;
 	}
 #endif	/* DEV_NETMAP */
 
 	NET_EPOCH_EXIT(et);
 	return (m);
 }
 
 /*
  * Input hook for infiniband frames received on lagg port @ifp.
  *
  * The frame is dropped (NULL returned) when the lagg is not running, the
  * port is detaching or no protocol is configured.  Otherwise it is tapped
  * to BPF on the lagg interface and passed to the protocol's input routine;
  * whatever that returns is dropped as well if the lagg is in monitor mode.
  *
  * Returns the mbuf to continue processing with, or NULL if it was
  * consumed.
  */
 static struct mbuf *
 lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_port *port = ifp->if_lagg;
 	struct lagg_softc *softc = port->lp_softc;
 	struct ifnet *laggifp = softc->sc_ifp;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	if ((laggifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    port->lp_detaching != 0 ||
 	    softc->sc_proto == LAGG_PROTO_NONE) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (NULL);
 	}
 
 	INFINIBAND_BPF_MTAP(laggifp, m);
 
 	m = lagg_proto_input(softc, port, m);
 	/* In monitor mode nothing is passed up the stack. */
 	if (m != NULL && (laggifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 	NET_EPOCH_EXIT(et);
 	return (m);
 }
 
 /*
  * Media change callback of the lagg interface.  The request is ignored;
  * the function name is printed when IFF_DEBUG is set in sc_ifflags.
  * Always returns 0.
  */
 static int
 lagg_media_change(struct ifnet *ifp)
 {
 	struct lagg_softc *softc = (struct lagg_softc *)ifp->if_softc;
 
 	if ((softc->sc_ifflags & IFF_DEBUG) != 0)
 		printf("%s\n", __func__);
 
 	/* Ignore */
 	return (0);
 }
 
 /*
  * Media status callback of the lagg interface.  Reports
  * IFM_ETHER | IFM_AUTO as the active media and sets IFM_ACTIVE in the
  * status when at least one port is active.
  */
 static void
 lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
 {
 	struct lagg_softc *softc = (struct lagg_softc *)ifp->if_softc;
 	struct epoch_tracker et;
 	struct lagg_port *port;
 
 	imr->ifm_active = IFM_ETHER | IFM_AUTO;
 	imr->ifm_status = IFM_AVALID;
 
 	NET_EPOCH_ENTER(et);
 	CK_SLIST_FOREACH(port, &softc->sc_ports, lp_entries) {
 		if (!LAGG_PORTACTIVE(port))
 			continue;
 		imr->ifm_status |= IFM_ACTIVE;
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 static void
 lagg_linkstate(struct lagg_softc *sc)
 {
 	struct epoch_tracker et;
 	struct lagg_port *lp;
 	int new_link = LINK_STATE_DOWN;
 	uint64_t speed;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* LACP handles link state itself */
 	if (sc->sc_proto == LAGG_PROTO_LACP)
 		return;
 
 	/* Our link is considered up if at least one of our ports is active */
 	NET_EPOCH_ENTER(et);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
 			new_link = LINK_STATE_UP;
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	if_link_state_change(sc->sc_ifp, new_link);
 
 	/* Update if_baudrate to reflect the max possible speed */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
 			    sc->sc_primary->lp_ifp->if_baudrate : 0;
 			break;
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			speed = 0;
 			NET_EPOCH_ENTER(et);
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				speed += lp->lp_ifp->if_baudrate;
 			NET_EPOCH_EXIT(et);
 			sc->sc_ifp->if_baudrate = speed;
 			break;
 		case LAGG_PROTO_LACP:
 			/* LACP updates if_baudrate itself */
 			break;
 	}
 }
 
 /*
  * Link state change notification for lagg port @ifp.  Under the lagg
  * exclusive lock the lagg link state is recomputed and the protocol is
  * told about the port.  @state is not used.  Does nothing when @ifp is not
  * attached to a lagg.
  */
 static void
 lagg_port_state(struct ifnet *ifp, int state)
 {
 	struct lagg_port *port = (struct lagg_port *)ifp->if_lagg;
 	struct lagg_softc *softc;
 
 	if (port == NULL)
 		return;
 	softc = port->lp_softc;
 	if (softc == NULL)
 		return;
 
 	LAGG_XLOCK(softc);
 	lagg_linkstate(softc);
 	lagg_proto_linkstate(softc, port);
 	LAGG_XUNLOCK(softc);
 }
 
 /*
  * Find a port which reports an active link state.
  *
  * Preference order: @lp itself, then the port following @lp in the port
  * list, then the first active port found scanning the list from its head.
  * @lp may be NULL, in which case only the scan is done.
  *
  * Returns the chosen port or NULL when no port is active.
  */
 struct lagg_port *
 lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_port *cand;
 
 #ifdef INVARIANTS
 	/*
 	 * This is called either inside the network epoch
 	 * or with LAGG_XLOCK(sc) held.
 	 */
 	if (!in_epoch(net_epoch_preempt))
 		LAGG_XLOCK_ASSERT(sc);
 #endif
 
 	if (lp != NULL) {
 		if (LAGG_PORTACTIVE(lp))
 			return (lp);
 		cand = CK_SLIST_NEXT(lp, lp_entries);
 		if (cand != NULL && LAGG_PORTACTIVE(cand))
 			return (cand);
 	}
 
 	/* Fall back to scanning the whole port list. */
 	CK_SLIST_FOREACH(cand, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(cand))
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Hand mbuf @m to port interface @ifp for transmission.
  *
  * When the mbuf carries a send tag (CSUM_SND_TAG), the lagg wrapper tag is
  * replaced by a new reference on the wrapped port-level tag and the
  * reference on the wrapper is dropped.  If the wrapped tag belongs to a
  * different interface than @ifp the mbuf is freed and EAGAIN is returned.
  *
  * Returns the result of the port's if_transmit routine otherwise.
  */
 int
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0) {
 		struct m_snd_tag *outer = m->m_pkthdr.snd_tag;
 		struct lagg_snd_tag *wrapper = mst_to_lst(outer);
 
 		if (wrapper->tag->ifp != ifp) {
 			m_freem(m);
 			return (EAGAIN);
 		}
 		/* Take the new reference before dropping the old one. */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(wrapper->tag);
 		m_snd_tag_rele(outer);
 	}
 #endif
 	return (ifp->if_transmit)(ifp, m);
 }
 
 /*
  * Simple round robin aggregation
  */
 /*
  * Attach the round robin protocol: start with a stride of 1 and a
  * sequence counter of 0.
  */
 static void
 lagg_rr_attach(struct lagg_softc *sc)
 {
 
 	sc->sc_stride = 1;
 	sc->sc_seq = 0;
 }
 
 /*
  * Round robin transmit.
  *
  * The sequence counter is atomically post-incremented; its old value,
  * divided by the stride and reduced modulo the port count, is the index
  * of the port to use.  If that port is not active, lagg_link_active()
  * picks another one.  Without any active port the mbuf is freed, an output
  * error is counted and ENETDOWN is returned.
  */
 static int
 lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *port;
 	uint32_t idx;
 
 	idx = atomic_fetchadd_32(&sc->sc_seq, 1);
 	idx /= sc->sc_stride;
 	idx %= sc->sc_count;
 
 	/* Step to the idx'th port of the list. */
 	port = CK_SLIST_FIRST(&sc->sc_ports);
 	for (; idx != 0; idx--)
 		port = CK_SLIST_NEXT(port, lp_entries);
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	port = lagg_link_active(sc, port);
 	if (port == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(port->lp_ifp, m));
 }
 
 /*
  * Round robin input: accept every packet, marking it as received on the
  * lagg interface.  @lp is not used.
  */
 static struct mbuf *
 lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = sc->sc_ifp;
 	return (m);
 }
 
 /*
  * Broadcast mode
  */
 
 /*
  * Transmit m on every port for which LAGG_PORTACTIVE() is true.
  *
  * The loop keeps one port ("last") in reserve: each time a further active
  * port is found, a copy of m is sent on the previously remembered port.
  * After the loop the original mbuf is sent on the final remembered port,
  * so N active ports need N-1 copies.
  *
  * Returns ENOENT when no port is active, ENETDOWN when lagg_link_active()
  * yields no port for the remembered one, otherwise the result of the final
  * lagg_enqueue().  A failed copy is counted in IFCOUNTER_OERRORS.
  */
 static int
 lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	int errors = 0;
 	int ret;
 	struct lagg_port *lp, *last = NULL;
 	struct mbuf *m0;
 
 	NET_EPOCH_ASSERT();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (!LAGG_PORTACTIVE(lp))
 			continue;
 
 		if (last != NULL) {
 			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (m0 == NULL) {
 				/*
 				 * NOTE(review): this value never reaches the
 				 * caller; after the break, ret is either
 				 * overwritten by lagg_enqueue() below or the
 				 * function returns ENETDOWN.
 				 */
 				ret = ENOBUFS;
 				errors++;
 				break;
 			}
 			/* The result of sending a copy is not checked. */
 			lagg_enqueue(last->lp_ifp, m0);
 		}
 		last = lp;
 	}
 
 	/* No active port was seen at all. */
 	if (last == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENOENT);
 	}
 	if ((last = lagg_link_active(sc, last)) == NULL) {
 		errors++;
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* The original mbuf goes out on the remaining port. */
 	ret = lagg_enqueue(last->lp_ifp, m);
 	if (errors != 0)
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
 
 	return (ret);
 }
 
 /*
  * Broadcast receive: mark m as received on the lagg interface itself
  * and hand it back.  lp is not used.
  */
 static struct mbuf *
 lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	m->m_pkthdr.rcvif = sc->sc_ifp;
 	return (m);
 }
 
 /*
  * Active failover
  */
 
 /*
  * Failover transmit: send m on the port lagg_link_active() selects when
  * asked about sc_primary (the master port if active, otherwise the next
  * available one).
  *
  * Returns the result of lagg_enqueue(), or ENETDOWN (after freeing m and
  * bumping IFCOUNTER_OERRORS) when no port is returned.
  */
 static int
 lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *txport;
 
 	txport = lagg_link_active(sc, sc->sc_primary);
 	if (txport != NULL)
 		return (lagg_enqueue(txport->lp_ifp, m));
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Failover receive.
  *
  * A packet is accepted (rcvif set to the lagg interface, mbuf returned)
  * when it arrived on the primary port, when V_lagg_failover_rx_all is
  * set, or when the primary is not active and lagg_link_active() names
  * either no port at all or the receiving port.  Any other packet is
  * freed and NULL is returned.
  */
 static struct mbuf *
 lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct lagg_port *active;
 
 	if (lp == sc->sc_primary || V_lagg_failover_rx_all)
 		goto accept;
 
 	if (LAGG_PORTACTIVE(sc->sc_primary))
 		goto drop;
 
 	/*
 	 * A NULL result means we received a packet while all links are
 	 * down.  Weird, but process it anyway.
 	 */
 	active = lagg_link_active(sc, sc->sc_primary);
 	if (active != NULL && active != lp)
 		goto drop;
 
 accept:
 	m->m_pkthdr.rcvif = sc->sc_ifp;
 	return (m);
 
 drop:
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Loadbalancing
  */
 static void
 lagg_lb_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 
 	LAGG_XLOCK_ASSERT(sc);
 	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
 	lb->lb_key = m_ether_tcpip_hash_init();
 	sc->sc_psc = lb;
 
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lagg_lb_port_create(lp);
 }
 
 /*
  * Release the load-balance state hanging off sc_psc, if there is any.
  * sc_psc itself is left untouched.
  */
 static void
 lagg_lb_detach(struct lagg_softc *sc)
 {
 	struct lagg_lb *state = (struct lagg_lb *)sc->sc_psc;
 
 	if (state == NULL)
 		return;
 	free(state, M_LAGG);
 }
 
 /*
  * Rebuild the load-balance port table from sc_ports, leaving out lp
  * (pass NULL to include every port).
  *
  * The table is cleared first and then filled in list order.  Returns 0,
  * or EINVAL if more than LAGG_MAX_PORTS ports would have to be stored; in
  * that case the first LAGG_MAX_PORTS entries stay in place.
  */
 static int
 lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *cur;
 	int slot;
 
 	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
 	LAGG_XLOCK_ASSERT(sc);
 
 	slot = 0;
 	CK_SLIST_FOREACH(cur, &sc->sc_ports, lp_entries) {
 		if (cur == lp)
 			continue;
 		if (slot >= LAGG_MAX_PORTS)
 			return (EINVAL);
 		if (sc->sc_ifflags & IFF_DEBUG)
 			printf("%s: port %s at index %d\n",
 			    sc->sc_ifname, cur->lp_ifp->if_xname, slot);
 		lb->lb_ports[slot] = cur;
 		slot++;
 	}
 
 	return (0);
 }
 
 /*
  * A port was added: rebuild the port table of lp's lagg with no port
  * excluded.  Returns the result of lagg_lb_porttable().
  */
 static int
 lagg_lb_port_create(struct lagg_port *lp)
 {
 
 	return (lagg_lb_porttable(lp->lp_softc, NULL));
 }
 
 /*
  * A port is going away: rebuild the port table of lp's lagg without lp.
  * The result of lagg_lb_porttable() is not reported.
  */
 static void
 lagg_lb_port_destroy(struct lagg_port *lp)
 {
 
 	lagg_lb_porttable(lp->lp_softc, lp);
 }
 
 /*
  * Load-balance transmit.
  *
  * The port index comes from the mbuf flowid (shifted by flowid_shift)
  * when LAGG_OPT_USE_FLOWID is set and the mbuf has a hash type, otherwise
  * from m_ether_tcpip_hash(); either value is reduced modulo sc_count and
  * used to index lb_ports.
  *
  * Returns the result of lagg_enqueue(), or ENETDOWN (after freeing m and
  * bumping IFCOUNTER_OERRORS) when lagg_link_active() yields no port.
  */
 static int
 lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *port;
 	uint32_t hash;
 
 	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
 	    M_HASHTYPE_GET(m) == M_HASHTYPE_NONE)
 		hash = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
 	else
 		hash = m->m_pkthdr.flowid >> sc->flowid_shift;
 	hash %= sc->sc_count;
 
 	/*
 	 * lagg_link_active() checks the link state and hands back the next
 	 * active port if the chosen one is down or NULL.
 	 */
 	port = lagg_link_active(sc, lb->lb_ports[hash]);
 	if (port != NULL)
 		return (lagg_enqueue(port->lp_ifp, m));
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Load-balance receive: mark m as received on the lagg interface itself
  * and hand it back.  lp is not used.
  */
 static struct mbuf *
 lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	m->m_pkthdr.rcvif = sc->sc_ifp;
 	return (m);
 }
 
 /*
  * 802.3ad LACP
  */
 static void
 lagg_lacp_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	lacp_attach(sc);
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static void
 lagg_lacp_detach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	void *psc;
 
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	psc = sc->sc_psc;
 	sc->sc_psc = NULL;
 	lacp_detach(psc);
 }
 
 static void
 lagg_lacp_lladdr(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	LAGG_SXLOCK_ASSERT(sc);
 
 	/* purge all the lacp ports */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	/* add them back in */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static int
 lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 	int err;
 
 	lp = lacp_select_tx_port(sc, m, &err);
 	if (lp == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (err);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct ether_header *eh;
 	u_short etype;
 
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 
 	/* Tap off LACP control messages */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
 		m = lacp_input(lp, m);
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * If the port is not collecting or not in the active aggregator then
 	 * free and return.
 	 */
 	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
 		m_freem(m);
 		return (NULL);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
diff --git a/sys/net/if_llatbl.c b/sys/net/if_llatbl.c
index 68a5ab931ec9..86bf9a0d47db 100644
--- a/sys/net/if_llatbl.c
+++ b/sys/net/if_llatbl.c
@@ -1,1196 +1,1197 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved.
  * Copyright (c) 2004-2008 Qing Li. All rights reserved.
  * Copyright (c) 2008 Kip Macy. All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <netinet/in.h>
 #include <net/if_llatbl.h>
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_debug.h>
 #include <net/vnet.h>
 #include <netinet/if_ether.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 
 /* malloc(9) type used for the allocations made in this file. */
 MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables");
 
 /* Per-vnet list of the lltables registered through lltable_link(). */
 VNET_DEFINE_STATIC(SLIST_HEAD(, lltable), lltables) =
     SLIST_HEAD_INITIALIZER(lltables);
 #define	V_lltables	VNET(lltables)
 
 /*
  * Read/write lock taken around V_lltables traversal (read) and around
  * insertion/removal (write), plus shorthand macros for it.
  */
 static struct rwlock lltable_list_lock;
 RW_SYSINIT(lltable_list_lock, &lltable_list_lock, "lltable_list_lock");
 #define	LLTABLE_LIST_RLOCK()		rw_rlock(&lltable_list_lock)
 #define	LLTABLE_LIST_RUNLOCK()		rw_runlock(&lltable_list_lock)
 #define	LLTABLE_LIST_WLOCK()		rw_wlock(&lltable_list_lock)
 #define	LLTABLE_LIST_WUNLOCK()		rw_wunlock(&lltable_list_lock)
 #define	LLTABLE_LIST_LOCK_ASSERT()	rw_assert(&lltable_list_lock, RA_LOCKED)
 
 /* Forward declarations for helpers defined further down. */
 static void lltable_unlink(struct lltable *llt);
 static void llentries_unlink(struct lltable *llt, struct llentries *head);
 
 /*
  * Dump lle state for one lltable.
  *
  * Tables that belong to a loopback interface are skipped (0 is returned).
  * Otherwise the table's llt_dump_entry callback is run for every entry
  * inside a network epoch section with @wr as its argument.  The lltable
  * list lock must be held.
  *
  * Returns the result of lltable_foreach_lle().
  */
 static int
 lltable_dump_af(struct lltable *llt, struct sysctl_req *wr)
 {
 	struct epoch_tracker et;
 	int rc;
 
 	LLTABLE_LIST_LOCK_ASSERT();
 
 	if ((llt->llt_ifp->if_flags & IFF_LOOPBACK) != 0)
 		return (0);
 
 	NET_EPOCH_ENTER(et);
 	rc = lltable_foreach_lle(llt,
 	    (llt_foreach_cb_t *)llt->llt_dump_entry, wr);
 	NET_EPOCH_EXIT(et);
 
 	return (rc);
 }
 
 /*
  * Dump arp state for a specific address family.
  *
  * Walks the lltable list under the read lock and dumps every table whose
  * address family equals @af.  Stops at the first table whose dump fails
  * and returns that error; returns 0 otherwise.
  */
 int
 lltable_sysctl_dumparp(int af, struct sysctl_req *wr)
 {
 	struct lltable *llt;
 	int error;
 
 	error = 0;
 
 	LLTABLE_LIST_RLOCK();
 	SLIST_FOREACH(llt, &V_lltables, llt_link) {
 		if (llt->llt_af != af)
 			continue;
 		error = lltable_dump_af(llt, wr);
 		if (error != 0)
 			break;
 	}
 	LLTABLE_LIST_RUNLOCK();
 
 	return (error);
 }
 
 /*
  * Adds a mbuf to hold queue. Drops old packets if the queue is full.
  *
  * Packets are dropped from the head of the queue for as long as
  * la_numheld is at least @maxheld and the queue is not empty; @m is then
  * linked at the tail and la_numheld is incremented.  @lle must be
  * write-locked.
  *
  * Returns the number of held packets that were dropped.
  */
 size_t
 lltable_append_entry_queue(struct llentry *lle, struct mbuf *m,
     size_t maxheld)
 {
 	struct mbuf *head, **tailp;
 	size_t ndropped;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	for (ndropped = 0; lle->la_numheld >= maxheld &&
 	    (head = lle->la_hold) != NULL; ndropped++) {
 		lle->la_hold = head->m_nextpkt;
 		m_freem(head);
 		lle->la_numheld--;
 	}
 
 	/* Find the link that terminates the queue and hang m there. */
 	tailp = &lle->la_hold;
 	while (*tailp != NULL)
 		tailp = &(*tailp)->m_nextpkt;
 	*tailp = m;
 
 	lle->la_numheld++;
 
 	return (ndropped);
 }
 
 
 /*
  * Common function helpers for chained hash table.
  */
 
 /*
  * Runs specified callback for each entry in @llt.
  * Caller does the locking.
  *
  * Iteration stops at the first callback invocation that returns non-zero
  * and that value is returned; 0 is returned when every callback returned
  * 0 (or the table is empty).  The _SAFE list iterator is used, so the
  * callback may take the current entry off its bucket list.
  */
 static int
 htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
 {
 	struct llentry *lle, *next;
 	int i, error;
 
 	for (i = 0; i < llt->llt_hsize; i++) {
 		CK_LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
 			error = f(llt, lle, farg);
 			/*
 			 * Return rather than break: a break would only leave
 			 * this bucket, letting later buckets run and
 			 * overwrite the error.
 			 */
 			if (error != 0)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * The htable_[un]link_entry() functions return:
  * 0 if the entry was (un)linked already and nothing changed,
  * 1 if the entry was added/removed to/from the table, and
  * -1 on error (e.g., not being able to add the entry due to limits reached).
  * While the "unlink" operation should never error, callers of
  * lltable_link_entry() need to check for errors and handle them.
  */
 static int
 htable_link_entry(struct lltable *llt, struct llentry *lle)
 {
 	struct llentries *bucket;
 	uint32_t slot;
 
 	/* Already in a table: nothing to do. */
 	if ((lle->la_flags & LLE_LINKED) != 0)
 		return (0);
 
 	IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
 
 	/* A positive llt_maxentries caps the table size. */
 	if (llt->llt_maxentries > 0 && llt->llt_entries >= llt->llt_maxentries)
 		return (-1);
 
 	slot = llt->llt_hash(lle, llt->llt_hsize);
 	bucket = &llt->lle_head[slot];
 
 	/* Fill in the back pointers before the entry becomes visible. */
 	lle->lle_head = bucket;
 	lle->lle_tbl = llt;
 	lle->la_flags |= LLE_LINKED;
 	CK_LIST_INSERT_HEAD(bucket, lle, lle_next);
 	llt->llt_entries++;
 
 	return (1);
 }
 
 /*
  * Take @lle off its hash bucket, clear LLE_VALID and LLE_LINKED and
  * decrement the owning table's entry count.  The lle_tbl and lle_head
  * back pointers are left in place.
  *
  * Returns 1 when the entry was removed, 0 when it was not linked.
  */
 static int
 htable_unlink_entry(struct llentry *lle)
 {
 	struct lltable *tbl;
 
 	if ((lle->la_flags & LLE_LINKED) == 0)
 		return (0);
 
 	tbl = lle->lle_tbl;
 	IF_AFDATA_WLOCK_ASSERT(tbl->llt_ifp);
 	KASSERT(tbl->llt_entries > 0, ("%s: lltable %p (%s) entries %d <= 0",
 	    __func__, tbl, if_name(tbl->llt_ifp), tbl->llt_entries));
 
 	CK_LIST_REMOVE(lle, lle_next);
 	lle->la_flags &= ~(LLE_VALID | LLE_LINKED);
 #if 0
 	lle->lle_tbl = NULL;
 	lle->lle_head = NULL;
 #endif
 	tbl->llt_entries--;
 
 	return (1);
 }
 
 /*
  * Argument block handed through lltable_foreach_lle() to
  * htable_prefix_free_cb().
  */
 struct prefix_match_data {
 	const struct sockaddr *addr;	/* passed to llt_match_prefix() */
 	const struct sockaddr *mask;	/* passed to llt_match_prefix() */
 	struct llentries dchain;	/* matching entries, chained via lle_chain */
 	u_int flags;			/* passed to llt_match_prefix() */
 };
 
 /*
  * Per-entry callback of htable_prefix_free(): an entry accepted by the
  * table's llt_match_prefix() is write-locked and put on the delete chain
  * in @farg (a struct prefix_match_data).  Always returns 0.
  */
 static int
 htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct prefix_match_data *pmd = (struct prefix_match_data *)farg;
 
 	if (!llt->llt_match_prefix(pmd->addr, pmd->mask, pmd->flags, lle))
 		return (0);
 
 	/* The entry stays locked while it sits on the delete chain. */
 	LLE_WLOCK(lle);
 	CK_LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain);
 
 	return (0);
 }
 
 /*
  * Remove from @llt every entry accepted by llt_match_prefix() for
  * @addr/@mask/@flags.
  *
  * Matching entries are collected and unlinked under the AFDATA write
  * lock; they are handed to lltable_free_entry() after it is released.
  */
 static void
 htable_prefix_free(struct lltable *llt, const struct sockaddr *addr,
     const struct sockaddr *mask, u_int flags)
 {
 	struct prefix_match_data pmd;
 	struct llentry *ent, *tmp;
 
 	bzero(&pmd, sizeof(pmd));
 	CK_LIST_INIT(&pmd.dchain);
 	pmd.flags = flags;
 	pmd.mask = mask;
 	pmd.addr = addr;
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd);
 	llentries_unlink(llt, &pmd.dchain);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 
 	CK_LIST_FOREACH_SAFE(ent, &pmd.dchain, lle_chain, tmp) {
 		lltable_free_entry(llt, ent);
 	}
 }
 
 /*
  * Release the bucket array of @llt and then the table structure itself.
  */
 static void
 htable_free_tbl(struct lltable *llt)
 {
 	struct llentries *buckets = llt->lle_head;
 
 	free(buckets, M_LLTABLE);
 	free(llt, M_LLTABLE);
 }
 
 static void
 llentries_unlink(struct lltable *llt, struct llentries *head)
 {
 	struct llentry *lle, *next;
 
 	CK_LIST_FOREACH_SAFE(lle, head, lle_chain, next)
 		llt->llt_unlink_entry(lle);
 }
 
 /*
  * Helper function used to drop all mbufs in hold queue.
  *
  * Each packet on la_hold is freed and la_numheld is decremented once per
  * packet; afterwards la_numheld is asserted to be 0.  @lle must be
  * write-locked.
  *
  * Returns the number of held packets, if any, that were dropped.
  */
 size_t
 lltable_drop_entry_queue(struct llentry *lle)
 {
 	size_t pkts_dropped = 0;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	while (lle->la_hold != NULL) {
 		struct mbuf *next = lle->la_hold->m_nextpkt;
 		m_freem(lle->la_hold);
 		lle->la_hold = next;
 		lle->la_numheld--;
 		pkts_dropped++;
 	}
 
 	/* pkts_dropped is a size_t, hence %zu rather than %zd. */
 	KASSERT(lle->la_numheld == 0,
 		("%s: la_numheld %d > 0, pkts_dropped %zu", __func__,
 		 lle->la_numheld, pkts_dropped));
 
 	return (pkts_dropped);
 }
 
 /*
  * Store a precomputed link-layer header in @lle.
  *
  * Copies @linkhdrsize bytes from @linkhdr into r_linkdata, records the
  * length, points ll_addr at offset @lladdr_off inside the copy and marks
  * the entry valid in both la_flags and r_flags.  @ifp is not used.
  */
 void
 lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
     const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
 	memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
 	lle->r_hdrlen = linkhdrsize;
 	lle->ll_addr = lle->r_linkdata + lladdr_off;
 
 	/* Flag the entry valid only once the data is in place. */
 	lle->la_flags |= LLE_VALID;
 	lle->r_flags |= RLLE_VALID;
 }
 
 /*
  * Acquires lltable write lock.
  *
  * The caller enters with @lle write-locked.  That lock is dropped, the
  * AFDATA write lock of @ifp is taken, and the @lle write lock is taken
  * again, so the entry may have changed in between.
  *
  * Returns true on success, with both lltable and lle lock held.
  * On failure, false is returned and lle wlock is still held.
  */
 bool
 lltable_acquire_wlock(struct ifnet *ifp, struct llentry *lle)
 {
 	NET_EPOCH_ASSERT();
 
 	/* Perform real LLE update */
 	/* use afdata WLOCK to update fields */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/*
 	 * Since we dropped the LLE lock, another thread might have deleted
 	 * this lle. Check and return.
 	 */
 	if ((lle->la_flags & LLE_DELETED) != 0) {
 		/* Only the AFDATA lock is released; @lle stays locked. */
 		IF_AFDATA_WUNLOCK(ifp);
 		return (false);
 	}
 
 	return (true);
 }
 
 /*
  * Tries to update @lle link-level address.
  * Since update requires AFDATA WLOCK, function
  * drops @lle lock, acquires AFDATA lock and then acquires
  * @lle lock to maintain lock order.
  *
  * Returns 1 on success, 0 if lltable_acquire_wlock() failed; the AFDATA
  * lock is not held on return in either case.
  */
 int
 lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
     const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
 	if (lltable_acquire_wlock(ifp, lle)) {
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		IF_AFDATA_WUNLOCK(ifp);
 		return (1);
 	}
 
 	return (0);
 }
 
  /*
  * Helper function used to pre-compute full/partial link-layer
  * header data suitable for feeding into if_output().
  */
 int
 lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
     char *buf, size_t *bufsize, int *lladdr_off)
 {
 	struct if_encap_req ereq;
 	int error;
 
 	bzero(buf, *bufsize);
 	bzero(&ereq, sizeof(ereq));
 	ereq.buf = buf;
 	ereq.bufsize = *bufsize;
 	ereq.rtype = IFENCAP_LL;
 	ereq.family = family;
 	ereq.lladdr = lladdr;
 	ereq.lladdr_len = ifp->if_addrlen;
 	error = ifp->if_requestencap(ifp, &ereq);
 	if (error == 0) {
 		*bufsize = ereq.bufsize;
 		*lladdr_off = ereq.lladdr_off;
 	}
 
 	return (error);
 }
 
 /*
  * Searches for the child entry matching @family inside @lle.
  * Returns the first child whose r_family equals @family, or NULL if
  * there is none or @lle itself is NULL.
  */
 struct llentry *
 llentry_lookup_family(struct llentry *lle, int family)
 {
 	struct llentry *kid;
 
 	if (lle != NULL) {
 		CK_SLIST_FOREACH(kid, &lle->lle_children, lle_child_next) {
 			if (kid->r_family == family)
 				return (kid);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Retrieves upper protocol family for the llentry.
  * By default, all "normal" (e.g. upper_family == transport_family)
  * llentries have r_family set to 0.
  * Thus, @default_family is returned for those and r_family for all
  * other entries.
  */
 int
 llentry_get_upper_family(const struct llentry *lle, int default_family)
 {
 
 	if (lle->r_family != 0)
 		return (lle->r_family);
 	return (default_family);
 }
 
 /*
  * Prints llentry @lle data into provided buffer.
  * Example: lle/inet/valid/em0/1.2.3.4
  *
  * The fields are the upper protocol family, "valid" or "no_l2" depending
  * on RLLE_VALID, the interface name and the L3 address; the address is
  * shown as "????" when @family is not a compiled-in AF_INET/AF_INET6.
  *
  * Returns @buf.
  */
 char *
 llentry_print_buf(const struct llentry *lle, struct ifnet *ifp, int family,
     char *buf, size_t bufsize)
 {
 #if defined(INET) || defined(INET6)
 	char abuf[INET6_ADDRSTRLEN];
 #endif
 	const char *upper_str, *valid;
 
 	if ((lle->r_flags & RLLE_VALID) != 0)
 		valid = "valid";
 	else
 		valid = "no_l2";
 	upper_str = rib_print_family(llentry_get_upper_family(lle, family));
 
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		inet_ntop(AF_INET, &lle->r_l3addr.addr4, abuf, sizeof(abuf));
 		snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str,
 		    valid, if_name(ifp), abuf);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		inet_ntop(AF_INET6, &lle->r_l3addr.addr6, abuf, sizeof(abuf));
 		snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str,
 		    valid, if_name(ifp), abuf);
 		break;
 #endif
 	default:
 		snprintf(buf, bufsize, "lle/%s/%s/%s/????", upper_str,
 		    valid, if_name(ifp));
 		break;
 	}
 
 	return (buf);
 }
 
 /*
  * Same as llentry_print_buf(), with the interface and address family
  * taken from the table @lle points to.  Returns @buf.
  */
 char *
 llentry_print_buf_lltable(const struct llentry *lle, char *buf, size_t bufsize)
 {
 	struct lltable *tbl;
 	struct ifnet *ifp;
 	int af;
 
 	tbl = lle->lle_tbl;
 	ifp = lltable_get_ifp(tbl);
 	af = lltable_get_af(tbl);
 
 	return (llentry_print_buf(lle, ifp, af, buf, bufsize));
 }
 
 /*
  * Requests feedback from the datapath.
  * First packet using @lle should result in
  * setting r_skip_req back to 0 and updating
  * lle_hittime to the current time_uptime.
  *
  * r_skip_req is set to 1 on @lle and on each of its children, each under
  * the respective entry's request lock.
  */
 void
 llentry_request_feedback(struct llentry *lle)
 {
 	struct llentry *kid;
 
 	LLE_REQ_LOCK(lle);
 	lle->r_skip_req = 1;
 	LLE_REQ_UNLOCK(lle);
 
 	CK_SLIST_FOREACH(kid, &lle->lle_children, lle_child_next) {
 		LLE_REQ_LOCK(kid);
 		kid->r_skip_req = 1;
 		LLE_REQ_UNLOCK(kid);
 	}
 }
 
 /*
  * Updates the lle state to mark it has been used
  * and record the time.
  * Used by the llentry_provide_feedback() wrapper.
  *
  * Under the request lock, lle_hittime is set to time_uptime and
  * r_skip_req is cleared.
  */
 void
 llentry_mark_used(struct llentry *lle)
 {
 
 	LLE_REQ_LOCK(lle);
 	lle->lle_hittime = time_uptime;
 	lle->r_skip_req = 0;
 	LLE_REQ_UNLOCK(lle);
 }
 
 /*
  * Fetches the time when lle was used.
  * Return 0 if the entry was not used, relevant time_uptime
  *  otherwise.
  *
  * The recorded time only counts when r_skip_req is 0 and the time is
  * greater than 0; both fields are read under the request lock.
  */
 static time_t
 llentry_get_hittime_raw(struct llentry *lle)
 {
 	time_t when;
 
 	when = 0;
 
 	LLE_REQ_LOCK(lle);
 	if (lle->r_skip_req == 0 && lle->lle_hittime > 0)
 		when = lle->lle_hittime;
 	LLE_REQ_UNLOCK(lle);
 
 	return (when);
 }
 
 /*
  * Returns the most recent hit time among @lle and all of its children,
  * as reported by llentry_get_hittime_raw(); 0 if none was used.
  */
 time_t
 llentry_get_hittime(struct llentry *lle)
 {
 	struct llentry *kid;
 	time_t newest, t;
 
 	newest = llentry_get_hittime_raw(lle);
 
 	CK_SLIST_FOREACH(kid, &lle->lle_children, lle_child_next) {
 		t = llentry_get_hittime_raw(kid);
 		newest = (t > newest) ? t : newest;
 	}
 
 	return (newest);
 }
 
 /*
  * Update link-layer header for given @lle after
  * interface lladdr was changed.
  *
  * Callback for lltable_foreach_lle(); @farg is the interface.  Entries
  * without LLE_VALID are skipped.  The header is recomputed from the
  * interface's own address for LLE_IFADDR entries and from the entry's
  * stored link-layer address otherwise.  If the header cannot be
  * computed, the entry's existing header is left untouched.
  *
  * Always returns 0 so that the walk continues.
  */
 static int
 llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct ifnet *ifp;
 	u_char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	u_char *lladdr;
 	int lladdr_off;
 
 	ifp = (struct ifnet *)farg;
 
 	LLE_WLOCK(lle);
 	if ((lle->la_flags & LLE_VALID) == 0) {
 		LLE_WUNLOCK(lle);
 		return (0);
 	}
 
 	/* Read ll_addr only once the entry is locked. */
 	if ((lle->la_flags & LLE_IFADDR) != 0)
 		lladdr = IF_LLADDR(ifp);
 	else
 		lladdr = lle->ll_addr;
 
 	/*
 	 * lltable_calc_llheader() zeroes linkhdr before asking the driver,
 	 * so copying after a failure would wipe the header in use.
 	 */
 	linkhdrsize = sizeof(linkhdr);
 	if (lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr,
 	    &linkhdrsize, &lladdr_off) == 0)
 		memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
 	LLE_WUNLOCK(lle);
 
 	return (0);
 }
 
 /*
  * Update all calculated headers for given @llt
  */
 void
 lltable_update_ifaddr(struct lltable *llt)
 {
 
 	if (llt->llt_ifp->if_flags & IFF_LOOPBACK)
 		return;
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 }
 
 /*
  * Performs generic cleanup routines and frees lle.
  *
  * Called for non-linked entries, with callouts and
  * other AF-specific cleanups performed.
  *
  * @lle must be passed WLOCK'ed
  *
  * Returns the number of held packets, if any, that were dropped.
  */
 size_t
 llentry_free(struct llentry *lle)
 {
 	size_t ndropped;
 
 	LLE_WLOCK_ASSERT(lle);
 	KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle"));
 
 	ndropped = lltable_drop_entry_queue(lle);
 
 	/*
 	 * Cancel the timer.  A positive callout_stop() result drops one
 	 * reference (presumably the one the pending timer owned).
 	 */
 	if (callout_stop(&lle->lle_timer) > 0)
 		LLE_REMREF(lle);
 	LLE_FREE_LOCKED(lle);
 
 	return (ndropped);
 }
 
 /*
  * Per-entry callback of lltable_free(): write-lock @lle and put it on
  * the delete chain passed in @farg (a struct llentries).  Always
  * returns 0.
  */
 static int
 lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct llentries *doomed = (struct llentries *)farg;
 
 	LLE_WLOCK(lle);
 	CK_LIST_INSERT_HEAD(doomed, lle, lle_chain);
 
 	return (0);
 }
 
 /*
  * Free all entries from given table and free itself.
  */
 void
 lltable_free(struct lltable *llt)
 {
 	struct llentry *lle, *next;
 	struct llentries dchain;
 
 	KASSERT(llt != NULL, ("%s: llt is NULL", __func__));
 
 	lltable_unlink(llt);
 
 	CK_LIST_INIT(&dchain);
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	/* Push all lles to @dchain */
 	lltable_foreach_lle(llt, lltable_free_cb, &dchain);
 	llentries_unlink(llt, &dchain);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 
 	CK_LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) {
 		llentry_free(lle);
 	}
 
 	KASSERT(llt->llt_entries == 0, ("%s: lltable %p (%s) entires not 0: %d",
 	    __func__, llt, llt->llt_ifp->if_xname, llt->llt_entries));
 
 	llt->llt_free_tbl(llt);
 }
 
 /*
  * Deletes an address from given lltable.
  * Used for userland interaction to remove
  * individual entries. Skips entries added by OS.
  *
  * Returns 0 on success, ENOENT when no entry matches @l3addr, and EPERM
  * when the entry has LLE_IFADDR set but @flags does not.
  */
 int
 lltable_delete_addr(struct lltable *llt, u_int flags,
     const struct sockaddr *l3addr)
 {
 	struct llentry *lle;
 	struct ifnet *ifp;
 
 	ifp = llt->llt_ifp;
 	IF_AFDATA_WLOCK(ifp);
 	/*
 	 * LLE_EXCLUSIVE lookup; the entry presumably comes back
 	 * write-locked, since the EPERM path below unlocks it.
 	 */
 	lle = lla_lookup(llt, LLE_SF(l3addr->sa_family, LLE_EXCLUSIVE), l3addr);
 
 	if (lle == NULL) {
 		IF_AFDATA_WUNLOCK(ifp);
 		return (ENOENT);
 	}
 	/* Interface-address entries are only removed on explicit request. */
 	if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) {
 		IF_AFDATA_WUNLOCK(ifp);
 		LLE_WUNLOCK(lle);
 		return (EPERM);
 	}
 
 	lltable_unlink_entry(llt, lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* The AF-specific method disposes of the now unlinked entry. */
 	llt->llt_delete_entry(llt, lle);
 
 	return (0);
 }
 
 /*
  * Call the llt_prefix_free method of every registered lltable whose
  * address family equals @af, holding the lltable list read lock.
  */
 void
 lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask,
     u_int flags)
 {
 	struct lltable *llt;
 
 	LLTABLE_LIST_RLOCK();
 	SLIST_FOREACH(llt, &V_lltables, llt_link) {
 		if (llt->llt_af == af)
 			llt->llt_prefix_free(llt, addr, mask, flags);
 	}
 	LLTABLE_LIST_RUNLOCK();
 }
 
 /*
  * Delete llentries for which func() returns true.
  *
  * Argument block handed through lltable_foreach_lle() to
  * lltable_delete_conditional_cb().
  */
 struct lle_match_data {
 	struct llentries dchain;	/* matching entries, chained via lle_chain */
 	llt_match_cb_t *func;		/* caller's predicate */
 	void *farg;			/* opaque argument passed to func */
 };
 
 /*
  * Per-entry callback of lltable_delete_conditional(): an entry accepted
  * by the caller's predicate is write-locked and put on the delete chain
  * in @farg (a struct lle_match_data).  Always returns 0.
  */
 static int
 lltable_delete_conditional_cb(struct lltable *llt, struct llentry *lle,
     void *farg)
 {
 	struct lle_match_data *lmd = (struct lle_match_data *)farg;
 
 	if (!lmd->func(llt, lle, lmd->farg))
 		return (0);
 
 	LLE_WLOCK(lle);
 	CK_LIST_INSERT_HEAD(&lmd->dchain, lle, lle_chain);
 
 	return (0);
 }
 
 /*
  * Remove from @llt every entry for which @func (called with @farg)
  * returns true.
  *
  * Matching entries are collected and unlinked under the AFDATA write
  * lock; the table's llt_delete_entry method is called on each of them
  * after that lock is released.
  */
 void
 lltable_delete_conditional(struct lltable *llt, llt_match_cb_t *func,
     void *farg)
 {
 	struct lle_match_data lmd;
 	struct llentry *ent, *tmp;
 
 	bzero(&lmd, sizeof(lmd));
 	lmd.farg = farg;
 	lmd.func = func;
 	CK_LIST_INIT(&lmd.dchain);
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	lltable_foreach_lle(llt, lltable_delete_conditional_cb, &lmd);
 	llentries_unlink(llt, &lmd.dchain);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 
 	CK_LIST_FOREACH_SAFE(ent, &lmd.dchain, lle_chain, tmp) {
 		llt->llt_delete_entry(llt, ent);
 	}
 }
 
 /*
  * Allocate a zeroed lltable with @hsize hash buckets and install the
  * chained-hash implementations of the link, unlink, prefix-free, foreach
  * and free-table methods.  Both allocations use M_WAITOK.
  *
  * Returns the new table.
  */
 struct lltable *
 lltable_allocate_htbl(uint32_t hsize)
 {
 	struct lltable *tbl;
 	int bucket;
 
 	tbl = malloc(sizeof(*tbl), M_LLTABLE, M_WAITOK | M_ZERO);
 	tbl->llt_hsize = hsize;
 	tbl->lle_head = malloc(sizeof(struct llentries) * hsize,
 	    M_LLTABLE, M_WAITOK | M_ZERO);
 
 	bucket = 0;
 	while (bucket < tbl->llt_hsize) {
 		CK_LIST_INIT(&tbl->lle_head[bucket]);
 		bucket++;
 	}
 
 	/* Default callbacks. */
 	tbl->llt_free_tbl = htable_free_tbl;
 	tbl->llt_foreach_entry = htable_foreach_lle;
 	tbl->llt_prefix_free = htable_prefix_free;
 	tbl->llt_unlink_entry = htable_unlink_entry;
 	tbl->llt_link_entry = htable_link_entry;
 
 	return (tbl);
 }
 
 /*
  * Links lltable to global llt list.
  *
  * @llt is inserted at the head of V_lltables under the list write lock.
  */
 void
 lltable_link(struct lltable *llt)
 {
 
 	LLTABLE_LIST_WLOCK();
 	SLIST_INSERT_HEAD(&V_lltables, llt, llt_link);
 	LLTABLE_LIST_WUNLOCK();
 }
 
 /*
  * Removes @llt from V_lltables under the list write lock.
  */
 static void
 lltable_unlink(struct lltable *llt)
 {
 
 	LLTABLE_LIST_WLOCK();
 	SLIST_REMOVE(&V_lltables, llt, lltable, llt_link);
 	LLTABLE_LIST_WUNLOCK();
 
 }
 
 /*
  * Gets interface @ifp lltable for the specified @family
  *
  * Only AF_INET and AF_INET6 are handled, each only when the matching
  * kernel option is compiled in; NULL is returned for anything else.
  */
 struct lltable *
 lltable_get(struct ifnet *ifp, int family)
 {
 
 #ifdef INET
 	if (family == AF_INET)
 		return (in_lltable_get(ifp));
 #endif
 #ifdef INET6
 	if (family == AF_INET6)
 		return (in6_lltable_get(ifp));
 #endif
 
 	return (NULL);
 }
 
 /*
  * External methods used by lltable consumers
  */
 
 /*
  * Runs @f with @farg over the entries of @llt by way of the table's
  * llt_foreach_entry method and returns that method's result.
  */
 int
 lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
 {
 
 	return (llt->llt_foreach_entry(llt, f, farg));
 }
 
 /*
  * Forwards to the table's llt_alloc_entry method and returns its result.
  */
 struct llentry *
 lltable_alloc_entry(struct lltable *llt, u_int flags,
     const struct sockaddr *l3addr)
 {
 
 	return (llt->llt_alloc_entry(llt, flags, l3addr));
 }
 
 /*
  * Forwards to the table's llt_free_entry method.
  */
 void
 lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	llt->llt_free_entry(llt, lle);
 }
 
 /*
  * Forwards to the table's llt_link_entry method and returns its result.
  * For the default htable_link_entry() that is 0 (already linked),
  * 1 (linked) or -1 (entry limit reached).
  */
 int
 lltable_link_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	return (llt->llt_link_entry(llt, lle));
 }
 
 /*
  * Attach @child_lle to @lle: the child inherits the parent's table
  * pointer, is flagged LLE_LINKED and is inserted at the head of the
  * parent's child list.
  */
 void
 lltable_link_child_entry(struct llentry *lle, struct llentry *child_lle)
 {
 
 	/* Finish setting up the child before it appears on the list. */
 	child_lle->la_flags |= LLE_LINKED;
 	child_lle->lle_tbl = lle->lle_tbl;
 	child_lle->lle_parent = lle;
 
 	CK_SLIST_INSERT_HEAD(&lle->lle_children, child_lle, lle_child_next);
 }
 
 /*
  * Detach @child_lle from its parent: LLE_LINKED and the parent pointer
  * are cleared and the child is removed from the parent's child list.
  */
 void
 lltable_unlink_child_entry(struct llentry *child_lle)
 {
 	struct llentry *parent;
 
 	/* Remember the parent; the pointer is cleared below. */
 	parent = child_lle->lle_parent;
 
 	child_lle->la_flags &= ~LLE_LINKED;
 	child_lle->lle_parent = NULL;
 	CK_SLIST_REMOVE(&parent->lle_children, child_lle, llentry,
 	    lle_child_next);
 }
 
 /*
  * Forwards to the table's llt_unlink_entry method and returns its
  * result.  The method receives only @lle.
  */
 int
 lltable_unlink_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	return (llt->llt_unlink_entry(lle));
 }
 
 /*
  * Fill @sa for @lle using the llt_fill_sa_entry method of the table the
  * entry points to.
  */
 void
 lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 
 	lle->lle_tbl->llt_fill_sa_entry(lle, sa);
 }
 
 /*
  * Returns the interface stored in @llt (llt_ifp).
  */
 struct ifnet *
 lltable_get_ifp(const struct lltable *llt)
 {
 
 	return (llt->llt_ifp);
 }
 
 /*
  * Returns the address family stored in @llt (llt_af).
  */
 int
 lltable_get_af(const struct lltable *llt)
 {
 
 	return (llt->llt_af);
 }
 
 /*
  * Called in route_output when rtm_flags contains RTF_LLDATA.
  *
  * Handles RTM_ADD (create an entry for the RTAX_DST address with the
  * link-layer address from the RTAX_GATEWAY sockaddr_dl, replacing any
  * existing entry) and RTM_DELETE (remove the entry for RTAX_DST).
  *
  * Returns 0 on success, otherwise EINVAL (bad gateway, unknown interface
  * index, header computation failure, unsupported message type), ESRCH
  * (no lltable for the address family), ENOMEM (entry allocation failed),
  * EPERM (existing entry has LLE_IFADDR), or the result of
  * lltable_delete_addr().
  */
 int
 lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info)
 {
 	struct sockaddr_dl *dl =
 	    (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
 	struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST];
 	struct ifnet *ifp;
 	struct lltable *llt;
 	struct llentry *lle, *lle_tmp;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 	u_int laflags = 0;
 	int error;
 
 	if (dl == NULL || dl->sdl_family != AF_LINK)
 		return (EINVAL);
 
 	/* XXX: should be ntohs() */
 	ifp = ifnet_byindex(dl->sdl_index);
 	if (ifp == NULL) {
 		log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n",
 		    __func__, dl->sdl_index);
 		return EINVAL;
 	}
 
 	/*
 	 * NOTE(review): dst is dereferenced without a NULL check here;
 	 * presumably the caller guarantees RTAX_DST is present - confirm.
 	 */
 	llt = lltable_get(ifp, dst->sa_family);
 
 	if (llt == NULL)
 		return (ESRCH);
 
 	error = 0;
 
 	switch (rtm->rtm_type) {
 	case RTM_ADD:
 		/* Add static LLE */
 		laflags = 0;
 		/* An expiry of 0 marks the entry static. */
 		if (rtm->rtm_rmx.rmx_expire == 0)
 			laflags = LLE_STATIC;
 		lle = lltable_alloc_entry(llt, laflags, dst);
 		if (lle == NULL)
 			return (ENOMEM);
 
 		/* Build the link-layer header from the supplied address. */
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl),
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0) {
 			lltable_free_entry(llt, lle);
 			return (EINVAL);
 		}
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		if ((rtm->rtm_flags & RTF_ANNOUNCE))
 			lle->la_flags |= LLE_PUB;
 		lle->la_expire = rtm->rtm_rmx.rmx_expire;
 
 		laflags = lle->la_flags;
 
 		/* Try to link new entry */
 		lle_tmp = NULL;
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(lle);
 		lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst);
 		if (lle_tmp != NULL) {
 			/* Check if we are trying to replace immutable entry */
 			if ((lle_tmp->la_flags & LLE_IFADDR) != 0) {
 				IF_AFDATA_WUNLOCK(ifp);
 				LLE_WUNLOCK(lle_tmp);
 				lltable_free_entry(llt, lle);
 				return (EPERM);
 			}
 			/* Unlink existing entry from table */
 			lltable_unlink_entry(llt, lle_tmp);
 		}
 		/*
 		 * NOTE(review): the result of lltable_link_entry() is not
 		 * checked; htable_link_entry() returns -1 when the table's
 		 * entry limit is reached - confirm this is intended.
 		 */
 		lltable_link_entry(llt, lle);
 		/* Remember that a published (proxy) entry has been added. */
 		if ((lle->la_flags & LLE_PUB) != 0 &&
 		    (llt->llt_flags & LLT_ADDEDPROXY) == 0)
 			llt->llt_flags |= LLT_ADDEDPROXY;
 		IF_AFDATA_WUNLOCK(ifp);
 
 		/* Dispose of the entry that was replaced, if any. */
 		if (lle_tmp != NULL) {
 			EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED);
 			lltable_free_entry(llt, lle_tmp);
 		}
 
 		/*
 		 * By invoking LLE handler here we might get
 		 * two events on static LLE entry insertion
 		 * in routing socket. However, since we might have
 		 * other subscribers we need to generate this event.
 		 */
 		EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 		LLE_WUNLOCK(lle);
 		llt->llt_post_resolved(llt, lle);
 		break;
 
 	case RTM_DELETE:
 		/* Flags of 0: entries with LLE_IFADDR are refused (EPERM). */
 		return (lltable_delete_addr(llt, 0, dst));
 
 	default:
 		error = EINVAL;
 	}
 
 	return (error);
 }
 
 #ifdef DDB
 /*
  * View used by the DDB helpers below: an llentry followed directly by a
  * sockaddr holding its L3 address.
  * NOTE(review): this assumes the pointed-to objects really have this
  * layout; nothing in this file establishes that - confirm.
  */
 struct llentry_sa {
 	struct llentry		base;
 	struct sockaddr		l3_addr;
 };
 
 /*
  * DDB helper: print the fields of one llentry, followed by its L3
  * address when the address family is a compiled-in AF_INET/AF_INET6.
  */
 static void
 llatbl_lle_show(struct llentry_sa *la)
 {
 	struct llentry *lle;
 	uint8_t octet[6];
 
 	lle = &la->base;
 	db_printf("lle=%p\n", lle);
 	db_printf(" lle_next=%p\n", lle->lle_next.cle_next);
 	db_printf(" lle_lock=%p\n", &lle->lle_lock);
 	db_printf(" lle_tbl=%p\n", lle->lle_tbl);
 	db_printf(" lle_head=%p\n", lle->lle_head);
 	db_printf(" la_hold=%p\n", lle->la_hold);
 	db_printf(" la_numheld=%d\n", lle->la_numheld);
 	db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire);
 	db_printf(" la_flags=0x%04x\n", lle->la_flags);
 	db_printf(" la_asked=%u\n", lle->la_asked);
 	db_printf(" la_preempt=%u\n", lle->la_preempt);
 	db_printf(" ln_state=%d\n", lle->ln_state);
 	db_printf(" ln_router=%u\n", lle->ln_router);
 	db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick);
 	db_printf(" lle_refcnt=%d\n", lle->lle_refcnt);
 	/* Exactly six bytes of the link-layer address are copied and shown. */
 	bcopy(lle->ll_addr, octet, sizeof(octet));
 	db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n",
 	    octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]);
 	db_printf(" lle_timer=%p\n", &lle->lle_timer);
 
 	switch (la->l3_addr.sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct sockaddr_in *sin;
 		char l3s[INET_ADDRSTRLEN];
 
 		sin = (struct sockaddr_in *)&la->l3_addr;
 		inet_ntoa_r(sin->sin_addr, l3s);
 		db_printf(" l3_addr=%s\n", l3s);
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct sockaddr_in6 *sin6;
 		char l3s[INET6_ADDRSTRLEN];
 
 		sin6 = (struct sockaddr_in6 *)&la->l3_addr;
 		ip6_sprintf(l3s, &sin6->sin6_addr);
 		db_printf(" l3_addr=%s\n", l3s);
 		break;
 	}
 #endif
 	default:
 		db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family);
 		break;
 	}
 }
 
 /*
  * DDB "show llentry <addr>": print the llentry at the given address, or
  * a usage line when no address was given.  have_addr and addr are
  * presumably supplied by DB_SHOW_COMMAND.
  */
 DB_SHOW_COMMAND(llentry, db_show_llentry)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show llentry <struct llentry *>\n");
 		return;
 	}
 
 	llatbl_lle_show((struct llentry_sa *)addr);
 }
 
 /*
  * DDB helper: print a one-line summary of @llt, then every entry in
  * every hash bucket.  Returns early once db_pager_quit is set.
  */
 static void
 llatbl_llt_show(struct lltable *llt)
 {
 	struct llentry *ent;
 	int bucket;
 
 	db_printf("llt=%p llt_af=%d llt_ifp=%p\n",
 	    llt, llt->llt_af, llt->llt_ifp);
 
 	for (bucket = 0; bucket < llt->llt_hsize; bucket++) {
 		CK_LIST_FOREACH(ent, &llt->lle_head[bucket], lle_next) {
 			llatbl_lle_show((struct llentry_sa *)ent);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 
 /*
  * DDB "show lltable <addr>": print the lltable at the given address and
  * all of its entries, or a usage line when no address was given.
  * have_addr and addr are presumably supplied by DB_SHOW_COMMAND.
  */
 DB_SHOW_COMMAND(lltable, db_show_lltable)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show lltable <struct lltable *>\n");
 		return;
 	}
 
 	llatbl_llt_show((struct lltable *)addr);
 }
 
 /*
  * DDB "show all lltables": for every vnet, print one summary line per
  * registered lltable; with a non-zero address argument the entries of
  * each table are printed as well.  Stops once db_pager_quit is set.
  */
 DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct lltable *llt;
 
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET_QUIET(vnet_iter);
 #ifdef VIMAGE
 		db_printf("vnet=%p\n", curvnet);
 #endif
 		SLIST_FOREACH(llt, &V_lltables, llt_link) {
 			db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n",
 			    llt, llt->llt_af, llt->llt_ifp,
 			    (llt->llt_ifp != NULL) ?
 				llt->llt_ifp->if_xname : "?");
 			if (have_addr && addr != 0) /* verbose */
 				llatbl_llt_show(llt);
 			if (db_pager_quit) {
 				/* Restore the vnet context before bailing out. */
 				CURVNET_RESTORE();
 				return;
 			}
 		}
 		CURVNET_RESTORE();
 	}
 }
 #endif
diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c
index f4d34c46f9f0..a483aea8860b 100644
--- a/sys/net/if_loop.c
+++ b/sys/net/if_loop.c
@@ -1,460 +1,461 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_loop.c	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 /*
  * Loopback interface driver for protocol testing and timing.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #ifdef	INET
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #endif
 
 #ifdef INET6
 #ifndef INET
 #include <netinet/in.h>
 #endif
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #ifdef TINY_LOMTU
 #define	LOMTU	(1024+512)
 #elif defined(LARGE_LOMTU)
 #define LOMTU	131072
 #else
 #define LOMTU	16384
 #endif
 
 #define	LO_CSUM_FEATURES	(CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)
 #define	LO_CSUM_FEATURES6	(CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6)
 #define	LO_CSUM_SET		(CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \
 				    CSUM_PSEUDO_HDR | \
 				    CSUM_IP_CHECKED | CSUM_IP_VALID | \
 				    CSUM_SCTP_VALID)
 
 static int	loioctl(struct ifnet *, u_long, caddr_t);
 static int	looutput(struct ifnet *ifp, struct mbuf *m,
 		    const struct sockaddr *dst, struct route *ro);
 
 VNET_DEFINE(struct ifnet *, loif);	/* Used externally */
 
 #ifdef VIMAGE
 VNET_DEFINE_STATIC(struct if_clone *, lo_cloner);
 #define	V_lo_cloner		VNET(lo_cloner)
 #endif
 
 static struct if_clone *lo_cloner;
 static const char loname[] = "lo";
 
 /*
  * Cloner destroy callback for loopback interfaces.
  *
  * Unit 0 is only destroyed when IFC_F_FORCE is set in flags; otherwise
  * EINVAL is returned.  For any other request BPF is detached, the
  * interface is detached and freed, and 0 is returned.
  */
 static int
 lo_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	if (ifp->if_dunit == 0 && (flags & IFC_F_FORCE) == 0)
 		return (EINVAL);
 
 #ifndef VIMAGE
 	/* XXX: destroying lo0 will lead to panics. */
 	KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__));
 #endif
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 
 	return (0);
 }
 
 /*
  * Cloner create callback for loopback interfaces.
  *
  * Allocates an IFT_LOOP ifnet named "lo<unit>" (unit taken from ifd),
  * fills in MTU, flags, handlers, capabilities and checksum-assist bits,
  * attaches it and registers it with BPF using DLT_NULL.  The first
  * interface created while V_loif is NULL becomes V_loif.
  *
  * On success stores the new interface in *ifpp and returns 0; returns
  * ENOSPC when if_alloc() fails.
  */
 static int
 lo_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct ifnet *ifp;
 
 	ifp = if_alloc(IFT_LOOP);
 	if (ifp == NULL)
 		return (ENOSPC);
 
 	if_initname(ifp, loname, ifd->unit);
 	ifp->if_mtu = LOMTU;
 	ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
 	ifp->if_ioctl = loioctl;
 	ifp->if_output = looutput;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/* All advertised capabilities start out enabled. */
 	ifp->if_capabilities = ifp->if_capenable =
 	    IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_LINKSTATE;
 	ifp->if_hwassist = LO_CSUM_FEATURES | LO_CSUM_FEATURES6;
 	if_attach(ifp);
 	/* BPF sees a 4-byte pseudo header (see the bpf_mtap2() call in if_simloop()). */
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	if (V_loif == NULL)
 		V_loif = ifp;
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Per-vnet initialization: register the "lo" cloner with its create and
  * destroy callbacks (automatic unit allocation enabled) and then create
  * unit 0.  The return value of ifc_create_ifp() is not checked.
  */
 static void
 vnet_loif_init(const void *unused __unused)
 {
 	struct if_clone_addreq req = {
 		.create_f = lo_clone_create,
 		.destroy_f = lo_clone_destroy,
 		.flags = IFC_F_AUTOUNIT,
 	};
 	lo_cloner = ifc_attach_cloner(loname, &req);
 #ifdef VIMAGE
 	/* Remember the cloner per vnet so vnet_loif_uninit() can detach it. */
 	V_lo_cloner = lo_cloner;
 #endif
 	struct ifc_data ifd = { .unit = 0 };
 	ifc_create_ifp(loname, &ifd, NULL);
 }
 VNET_SYSINIT(vnet_loif_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_loif_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-vnet teardown (VIMAGE only): detach this vnet's "lo" cloner and
  * clear V_loif.
  */
 static void
 vnet_loif_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_lo_cloner);
 	V_loif = NULL;
 }
 VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
     vnet_loif_uninit, NULL);
 #endif
 
 /*
  * Module event handler for the loopback driver.
  *
  * MOD_LOAD succeeds without doing anything, MOD_UNLOAD is refused with
  * EINVAL after logging a message, and every other event yields
  * EOPNOTSUPP.
  */
 static int
 loop_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD)
 		return (0);
 
 	if (type == MOD_UNLOAD) {
 		printf("loop module unload - not possible for this module type\n");
 		return (EINVAL);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 static moduledata_t loop_mod = {
 	"if_lo",
 	loop_modevent,
 	0
 };
 
 DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 
 /*
  * if_output handler for loopback interfaces.
  *
  * Frees the mbuf and returns an error when the MAC check fails or when
  * the route is flagged RT_REJECT (EHOSTUNREACH); a RT_BLACKHOLE route
  * drops the packet silently and returns 0.  Otherwise output counters
  * are bumped, the address family is determined, checksum flags are
  * adjusted for AF_INET/AF_INET6 and the packet is looped back through
  * if_simloop().  Any other family is freed and rejected with
  * EAFNOSUPPORT.
  */
 static int
 looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	u_int32_t af;
 #ifdef MAC
 	int error;
 #endif
 
 	M_ASSERTPKTHDR(m); /* check if we have the packet header */
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error) {
 		m_freem(m);
 		return (error);
 	}
 #endif
 
 	if (ro != NULL && ro->ro_flags & (RT_REJECT|RT_BLACKHOLE)) {
 		m_freem(m);
 		return (ro->ro_flags & RT_BLACKHOLE ? 0 : EHOSTUNREACH);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 
 #ifdef RSS
 	M_HASHTYPE_CLEAR(m);
 #endif
 
 	/* BPF writes need to be handled specially. */
 	if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT)
 		/* The real address family is carried in the first bytes of sa_data. */
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = RO_GET_FAMILY(ro, dst);
 
 #if 1	/* XXX */
 	switch (af) {
 	case AF_INET:
 		/* With RX checksum offload enabled, mark the checksums as verified. */
 		if (ifp->if_capenable & IFCAP_RXCSUM) {
 			m->m_pkthdr.csum_data = 0xffff;
 			m->m_pkthdr.csum_flags = LO_CSUM_SET;
 		}
 		/* Clear the IPv4 transmit-offload request bits. */
 		m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES;
 		break;
 	case AF_INET6:
 #if 0
 		/*
 		 * XXX-BZ for now always claim the checksum is good despite
 		 * any interface flags.   This is a workaround for 9.1-R and
 		 * a proper solution ought to be sought later.
 		 */
 		if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
 			m->m_pkthdr.csum_data = 0xffff;
 			m->m_pkthdr.csum_flags = LO_CSUM_SET;
 		}
 #else
 		m->m_pkthdr.csum_data = 0xffff;
 		m->m_pkthdr.csum_flags = LO_CSUM_SET;
 #endif
 		/* Clear the IPv6 transmit-offload request bits. */
 		m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES6;
 		break;
 	default:
 		printf("looutput: af=%d unexpected\n", af);
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 #endif
 	return (if_simloop(ifp, m, af, 0));
 }
 
 /*
  * if_simloop()
  *
  * This function is to support software emulation of hardware loopback,
  * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't
  * hear their own broadcasts, we create a copy of the packet that we
  * would normally receive via a hardware loopback.
  *
  * This function expects the packet to include the media header of length hlen.
  *
  * The mbuf is always consumed.  Returns EAFNOSUPPORT (after freeing the
  * mbuf) when af is not a family compiled in below; otherwise the packet
  * is queued to the matching netisr and 0 is returned, regardless of the
  * netisr_queue() result.
  */
 int
 if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen)
 {
 	int isr;
 
 	M_ASSERTPKTHDR(m);
 	m_tag_delete_nonpersistent(m);
 	m->m_pkthdr.rcvif = ifp;
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Let BPF see incoming packet in the following manner:
 	 *  - Emulated packet loopback for a simplex interface
 	 *    (net/if_ethersubr.c)
 	 *	-> passes it to ifp's BPF
 	 *  - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c)
 	 *	-> does not pass it to any BPF
 	 *  - Normal packet loopback from myself to myself (net/if_loop.c)
 	 *	-> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0)
 	 */
 	if (hlen > 0) {
 		if (bpf_peers_present(ifp->if_bpf)) {
 			bpf_mtap(ifp->if_bpf, m);
 		}
 	} else {
 		if (bpf_peers_present(V_loif->if_bpf)) {
 			if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) {
 				/* XXX beware sizeof(af) != 4 */
 				u_int32_t af1 = af;
 
 				/*
 				 * We need to prepend the address family.
 				 */
 				bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m);
 			}
 		}
 	}
 
 	/* Strip away media header */
 	if (hlen > 0) {
 		m_adj(m, hlen);
 #ifndef __NO_STRICT_ALIGNMENT
 		/*
 		 * Some archs do not like unaligned data, so
 		 * we move data down in the first mbuf.
 		 */
 		if (mtod(m, vm_offset_t) & 3) {
 			KASSERT(hlen >= 3, ("if_simloop: hlen too small"));
 			/* Copy the data down to the preceding 4-byte boundary... */
 			bcopy(m->m_data,
 			    (char *)(mtod(m, vm_offset_t)
 				- (mtod(m, vm_offset_t) & 3)),
 			    m->m_len);
 			/* ...and point m_data at the new, aligned start. */
 			m->m_data -= (mtod(m,vm_offset_t) & 3);
 		}
 #endif
 	}
 
 	/* Deliver to upper layer protocol */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		m->m_flags |= M_LOOP;
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		printf("if_simloop: can't handle af=%d\n", af);
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	netisr_queue(isr, m);	/* mbuf is free'd on failure. */
 	return (0);
 }
 
 /*
  * Process an ioctl request.
  *
  * Handles SIOCSIFADDR, SIOCADDMULTI/SIOCDELMULTI, SIOCSIFMTU,
  * SIOCSIFFLAGS and SIOCSIFCAP; every other command returns EINVAL.
  * Returns 0 on success or an errno value.
  */
 static int
 loioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	int error = 0, mask;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		if_link_state_change(ifp, LINK_STATE_UP);
 		/*
 		 * Everything else is done at a higher level.
 		 */
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (ifr == NULL) {
 			error = EAFNOSUPPORT;		/* XXX */
 			break;
 		}
 		/* Only families compiled into the kernel are accepted. */
 		switch (ifr->ifr_addr.sa_family) {
 #ifdef INET
 		case AF_INET:
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			break;
 #endif
 
 		default:
 			error = EAFNOSUPPORT;
 			break;
 		}
 		break;
 
 	case SIOCSIFMTU:
 		/* The requested MTU is stored as-is; no range check is done here. */
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
 
 	case SIOCSIFFLAGS:
 		/* Mirror the administrative IFF_UP flag into the link state. */
 		if_link_state_change(ifp, (ifp->if_flags & IFF_UP) ?
 		    LINK_STATE_UP: LINK_STATE_DOWN);
 		break;
 
 	case SIOCSIFCAP:
 		/* Bits set in mask are the capabilities whose state should flip. */
 		mask = ifp->if_capenable ^ ifr->ifr_reqcap;
 		if ((mask & IFCAP_RXCSUM) != 0)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if ((mask & IFCAP_TXCSUM) != 0)
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 		/*
 		 * Toggling the IPv6 checksum capabilities is compiled out and
 		 * rejected.  The break statements below leave the switch, so
 		 * any IPv4 toggles applied above stay in effect and
 		 * if_hwassist is not recomputed on that path.
 		 */
 		if ((mask & IFCAP_RXCSUM_IPV6) != 0) {
 #if 0
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 #else
 			error = EOPNOTSUPP;
 			break;
 #endif
 		}
 		if ((mask & IFCAP_TXCSUM_IPV6) != 0) {
 #if 0
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 #else
 			error = EOPNOTSUPP;
 			break;
 #endif
 		}
 		/* Rebuild if_hwassist from the IPv4 TX checksum capability only. */
 		ifp->if_hwassist = 0;
 		if (ifp->if_capenable & IFCAP_TXCSUM)
 			ifp->if_hwassist = LO_CSUM_FEATURES;
 #if 0
 		if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 			ifp->if_hwassist |= LO_CSUM_FEATURES6;
 #endif
 		break;
 
 	default:
 		error = EINVAL;
 	}
 	return (error);
 }
diff --git a/sys/net/if_me.c b/sys/net/if_me.c
index 0f8336067116..2bbb6b15217d 100644
--- a/sys/net/if_me.c
+++ b/sys/net/if_me.c
@@ -1,689 +1,690 @@
 /*-
  * Copyright (c) 2014, 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_encap.h>
 
 #include <machine/in_cksum.h>
 #include <security/mac/mac_framework.h>
 
 #define	MEMTU			(1500 - sizeof(struct mobhdr))
 static const char mename[] = "me";
 static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP");
 /* Minimal forwarding header RFC 2004 */
 struct mobhdr {
 	uint8_t		mob_proto;	/* protocol */
 	uint8_t		mob_flags;	/* flags */
 #define	MOB_FLAGS_SP	0x80		/* source present */
 	uint16_t	mob_csum;	/* header checksum */
 	struct in_addr	mob_dst;	/* original destination address */
 	struct in_addr	mob_src;	/* original source addr (optional) */
 } __packed;
 
 struct me_softc {
 	struct ifnet		*me_ifp;
 	u_int			me_fibnum;
 	struct in_addr		me_src;
 	struct in_addr		me_dst;
 
 	CK_LIST_ENTRY(me_softc) chain;
 	CK_LIST_ENTRY(me_softc) srchash;
 };
 CK_LIST_HEAD(me_list, me_softc);
 #define	ME2IFP(sc)		((sc)->me_ifp)
 #define	ME_READY(sc)		((sc)->me_src.s_addr != 0)
 #define	ME_RLOCK_TRACKER	struct epoch_tracker me_et
 #define	ME_RLOCK()		epoch_enter_preempt(net_epoch_preempt, &me_et)
 #define	ME_RUNLOCK()		epoch_exit_preempt(net_epoch_preempt, &me_et)
 #define	ME_WAIT()		epoch_wait_preempt(net_epoch_preempt)
 
 #ifndef ME_HASH_SIZE
 #define	ME_HASH_SIZE	(1 << 4)
 #endif
 VNET_DEFINE_STATIC(struct me_list *, me_hashtbl) = NULL;
 VNET_DEFINE_STATIC(struct me_list *, me_srchashtbl) = NULL;
 #define	V_me_hashtbl		VNET(me_hashtbl)
 #define	V_me_srchashtbl		VNET(me_srchashtbl)
 #define	ME_HASH(src, dst)	(V_me_hashtbl[\
     me_hashval((src), (dst)) & (ME_HASH_SIZE - 1)])
 #define	ME_SRCHASH(src)		(V_me_srchashtbl[\
     fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (ME_HASH_SIZE - 1)])
 
 static struct sx me_ioctl_sx;
 SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl");
 
 static int	me_clone_create(struct if_clone *, int, caddr_t);
 static void	me_clone_destroy(struct ifnet *);
 VNET_DEFINE_STATIC(struct if_clone *, me_cloner);
 #define	V_me_cloner	VNET(me_cloner)
 
 #ifdef VIMAGE
 static void	me_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static void	me_qflush(struct ifnet *);
 static int	me_transmit(struct ifnet *, struct mbuf *);
 static int	me_ioctl(struct ifnet *, u_long, caddr_t);
 static int	me_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static int	me_input(struct mbuf *, int, int, void *);
 
 static int	me_set_tunnel(struct me_softc *, in_addr_t, in_addr_t);
 static void	me_delete_tunnel(struct me_softc *);
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Minimal Encapsulation for IP (RFC 2004)");
 #ifndef MAX_ME_NEST
 #define MAX_ME_NEST 1
 #endif
 
 VNET_DEFINE_STATIC(int, max_me_nesting) = MAX_ME_NEST;
 #define	V_max_me_nesting	VNET(max_me_nesting)
 SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(max_me_nesting), 0, "Max nested tunnels");
 
 /*
  * Hash a (src, dst) address pair: FNV-hash src starting from
  * FNV1_32_INIT, then continue the hash over dst.
  */
 static uint32_t
 me_hashval(in_addr_t src, in_addr_t dst)
 {
 	uint32_t seed = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT);
 
 	return (fnv_32_buf(&dst, sizeof(dst), seed));
 }
 
 static struct me_list *
 me_hashinit(void)
 {
 	struct me_list *hash;
 	int i;
 
 	hash = malloc(sizeof(struct me_list) * ME_HASH_SIZE,
 	    M_IFME, M_WAITOK);
 	for (i = 0; i < ME_HASH_SIZE; i++)
 		CK_LIST_INIT(&hash[i]);
 
 	return (hash);
 }
 
 /*
  * Per-vnet initialization: register the "me" interface cloner with its
  * create and destroy callbacks and remember it in V_me_cloner.
  */
 static void
 vnet_me_init(const void *unused __unused)
 {
 
 	V_me_cloner = if_clone_simple(mename, me_clone_create,
 	    me_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_me_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_me_init, NULL);
 
 /*
  * Per-vnet teardown: release both hash tables if they were ever
  * allocated (they are created on first use by me_set_tunnel()), then
  * detach the cloner.
  *
  * NOTE(review): V_me_hashtbl is freed before the pointer is cleared and
  * before ME_WAIT(), and V_me_srchashtbl is left pointing at freed memory.
  * Confirm that no epoch reader (me_lookup(), me_srcaddr()) can still be
  * running for this vnet at this point, and that if_clone_detach() does
  * not touch the freed list heads.
  */
 static void
 vnet_me_uninit(const void *unused __unused)
 {
 
 	if (V_me_hashtbl != NULL) {
 		free(V_me_hashtbl, M_IFME);
 		V_me_hashtbl = NULL;
 		ME_WAIT();
 		free(V_me_srchashtbl, M_IFME);
 	}
 	if_clone_detach(V_me_cloner);
 }
 VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_me_uninit, NULL);
 
 /*
  * Cloner create callback: allocate a zeroed softc and an IFT_TUNNEL
  * ifnet named "me<unit>", wire up the driver handlers, attach the
  * interface and register it with BPF (DLT_NULL).  The tunnel FIB
  * defaults to the FIB of the creating process.  Always returns 0.
  *
  * NOTE(review): the if_alloc() result is used without a NULL check;
  * confirm that it cannot fail in this kernel version.
  */
 static int
 me_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct me_softc *sc;
 
 	sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO);
 	sc->me_fibnum = curthread->td_proc->p_fibnum;
 	ME2IFP(sc) = if_alloc(IFT_TUNNEL);
 	ME2IFP(sc)->if_softc = sc;
 	if_initname(ME2IFP(sc), mename, unit);
 
 	ME2IFP(sc)->if_mtu = MEMTU;
 	ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST;
 	ME2IFP(sc)->if_output = me_output;
 	ME2IFP(sc)->if_ioctl = me_ioctl;
 	ME2IFP(sc)->if_transmit = me_transmit;
 	ME2IFP(sc)->if_qflush = me_qflush;
 #ifdef VIMAGE
 	ME2IFP(sc)->if_reassign = me_reassign;
 #endif
 	ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE;
 	ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE;
 	if_attach(ME2IFP(sc));
 	bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t));
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_reassign handler (VIMAGE only): tear down the configured tunnel
  * while holding me_ioctl_sx exclusively.  Nothing is done when the
  * softc pointer has already been cleared.
  */
 static void
 me_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct me_softc *sc;
 
 	sx_xlock(&me_ioctl_sx);
 	sc = ifp->if_softc;
 	if (sc != NULL)
 		me_delete_tunnel(sc);
 	sx_xunlock(&me_ioctl_sx);
 }
 #endif /* VIMAGE */
 
 /*
  * Cloner destroy callback.
  *
  * Under me_ioctl_sx: removes the tunnel configuration, detaches BPF and
  * the interface, and clears if_softc so that me_ioctl() and
  * me_transmit() see a NULL softc.  After the lock is dropped it waits
  * via ME_WAIT() before freeing the ifnet and the softc.
  */
 static void
 me_clone_destroy(struct ifnet *ifp)
 {
 	struct me_softc *sc;
 
 	sx_xlock(&me_ioctl_sx);
 	sc = ifp->if_softc;
 	me_delete_tunnel(sc);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	ifp->if_softc = NULL;
 	sx_xunlock(&me_ioctl_sx);
 
 	ME_WAIT();
 	if_free(ifp);
 	free(sc, M_IFME);
 }
 
 /*
  * if_ioctl handler.
  *
  * The first switch handles commands that do not touch the softc and
  * returns without taking any lock.  Every other command runs with
  * me_ioctl_sx held exclusively and fails with ENXIO when the softc has
  * already been cleared.  Returns 0 on success or an errno value;
  * unknown commands yield EINVAL.
  */
 static int
 me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct sockaddr_in *src, *dst;
 	struct me_softc *sc;
 	int error;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < 576)
 			return (EINVAL);
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		/* FALLTHROUGH */
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		return (0);
 	}
 	sx_xlock(&me_ioctl_sx);
 	sc = ifp->if_softc;
 	if (sc == NULL) {
 		error = ENXIO;
 		goto end;
 	}
 	error = 0;
 	switch (cmd) {
 	case SIOCSIFPHYADDR:
 		src = &((struct in_aliasreq *)data)->ifra_addr;
 		dst = &((struct in_aliasreq *)data)->ifra_dstaddr;
 		/* Both endpoints must be well-formed AF_INET sockaddrs. */
 		if (src->sin_family != dst->sin_family ||
 		    src->sin_family != AF_INET ||
 		    src->sin_len != dst->sin_len ||
 		    src->sin_len != sizeof(struct sockaddr_in)) {
 			error = EINVAL;
 			break;
 		}
 		if (src->sin_addr.s_addr == INADDR_ANY ||
 		    dst->sin_addr.s_addr == INADDR_ANY) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		error = me_set_tunnel(sc, src->sin_addr.s_addr,
 		    dst->sin_addr.s_addr);
 		break;
 	case SIOCDIFPHYADDR:
 		me_delete_tunnel(sc);
 		break;
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 		if (!ME_READY(sc)) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/* "src" is reused here as the output sockaddr for either endpoint. */
 		src = (struct sockaddr_in *)&ifr->ifr_addr;
 		memset(src, 0, sizeof(*src));
 		src->sin_family = AF_INET;
 		src->sin_len = sizeof(*src);
 		switch (cmd) {
 		case SIOCGIFPSRCADDR:
 			src->sin_addr = sc->me_src;
 			break;
 		case SIOCGIFPDSTADDR:
 			src->sin_addr = sc->me_dst;
 			break;
 		}
 		/* If prison_if() rejects the address, wipe it from the reply. */
 		error = prison_if(curthread->td_ucred, sintosa(src));
 		if (error != 0)
 			memset(src, 0, sizeof(*src));
 		break;
 	case SIOCGTUNFIB:
 		ifr->ifr_fib = sc->me_fibnum;
 		break;
 	case SIOCSTUNFIB:
 		if ((error = priv_check(curthread, PRIV_NET_ME)) != 0)
 			break;
 		if (ifr->ifr_fib >= rt_numfibs)
 			error = EINVAL;
 		else
 			sc->me_fibnum = ifr->ifr_fib;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 end:
 	sx_xunlock(&me_ioctl_sx);
 	return (error);
 }
 
 /*
  * Encap lookup callback: find the tunnel that an incoming packet
  * belongs to.
  *
  * The packet's destination must equal the tunnel source and the
  * packet's source must equal the tunnel destination.  On a match with
  * an interface that is IFF_UP, stores the softc in *arg and returns
  * ENCAP_DRV_LOOKUP; returns 0 when the hash table does not exist yet,
  * nothing matches, or the matching interface is down.  The off and
  * proto arguments are not used.
  */
 static int
 me_lookup(const struct mbuf *m, int off, int proto, void **arg)
 {
 	const struct ip *ip;
 	struct me_softc *sc;
 
 	if (V_me_hashtbl == NULL)
 		return (0);
 
 	NET_EPOCH_ASSERT();
 	ip = mtod(m, const struct ip *);
 	/* Hash on (local, remote), i.e. the packet's (dst, src). */
 	CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr,
 	    ip->ip_src.s_addr), chain) {
 		if (sc->me_src.s_addr == ip->ip_dst.s_addr &&
 		    sc->me_dst.s_addr == ip->ip_src.s_addr) {
 			if ((ME2IFP(sc)->if_flags & IFF_UP) == 0)
 				return (0);
 			*arg = sc;
 			return (ENCAP_DRV_LOOKUP);
 		}
 	}
 	return (0);
 }
 
 /*
  * Mark the interface running only while the tunnel's ingress (source)
  * address is one of the local host's addresses.
  */
 static void
 me_set_running(struct me_softc *sc)
 {
 
 	if (!in_localip(sc->me_src)) {
 		ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 	ME2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 }
 
 /*
  * ifaddr_event handler.
  * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent
  * source address spoofing.
  *
  * Re-evaluates the running state of every tunnel whose source address
  * equals the address carried in sa.  The event argument is ignored.
  */
 static void
 me_srcaddr(void *arg __unused, const struct sockaddr *sa,
     int event __unused)
 {
 	const struct sockaddr_in *sin;
 	struct me_softc *sc;
 
 	/* Check that VNET is ready */
 	if (V_me_hashtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	sin = (const struct sockaddr_in *)sa;
 	CK_LIST_FOREACH(sc, &ME_SRCHASH(sin->sin_addr.s_addr), srchash) {
 		/* Different addresses can share a bucket; skip the others. */
 		if (sc->me_src.s_addr != sin->sin_addr.s_addr)
 			continue;
 		me_set_running(sc);
 	}
 }
 
 /*
  * Configure the tunnel endpoints of sc.  The caller must hold
  * me_ioctl_sx exclusively.
  *
  * Returns 0 when the endpoints are unchanged or were installed, and
  * EADDRNOTAVAIL when another tunnel already uses the same (src, dst)
  * pair.  On success any previous configuration is removed first, the
  * softc is inserted into both hash tables, the running flag is
  * recomputed and the link state is set to up.
  */
 static int
 me_set_tunnel(struct me_softc *sc, in_addr_t src, in_addr_t dst)
 {
 	struct epoch_tracker et;
 	struct me_softc *tmp;
 
 	sx_assert(&me_ioctl_sx, SA_XLOCKED);
 
 	/* The hash tables are allocated lazily, on the first tunnel setup. */
 	if (V_me_hashtbl == NULL) {
 		V_me_hashtbl = me_hashinit();
 		V_me_srchashtbl = me_hashinit();
 	}
 
 	if (sc->me_src.s_addr == src && sc->me_dst.s_addr == dst)
 		return (0);
 
 	CK_LIST_FOREACH(tmp, &ME_HASH(src, dst), chain) {
 		if (tmp == sc)
 			continue;
 		if (tmp->me_src.s_addr == src &&
 		    tmp->me_dst.s_addr == dst)
 			return (EADDRNOTAVAIL);
 	}
 
 	me_delete_tunnel(sc);
 	sc->me_dst.s_addr = dst;
 	sc->me_src.s_addr = src;
 	CK_LIST_INSERT_HEAD(&ME_HASH(src, dst), sc, chain);
 	CK_LIST_INSERT_HEAD(&ME_SRCHASH(src), sc, srchash);
 
 	NET_EPOCH_ENTER(et);
 	me_set_running(sc);
 	NET_EPOCH_EXIT(et);
 	if_link_state_change(ME2IFP(sc), LINK_STATE_UP);
 	return (0);
 }
 
 static void
 me_delete_tunnel(struct me_softc *sc)
 {
 
 	sx_assert(&me_ioctl_sx, SA_XLOCKED);
 	if (ME_READY(sc)) {
 		CK_LIST_REMOVE(sc, chain);
 		CK_LIST_REMOVE(sc, srchash);
 		ME_WAIT();
 
 		sc->me_src.s_addr = 0;
 		sc->me_dst.s_addr = 0;
 		ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		if_link_state_change(ME2IFP(sc), LINK_STATE_DOWN);
 	}
 }
 
 /*
  * One's-complement checksum over nwords 16-bit words starting at p.
  * Returns the complement of the folded sum; a non-positive nwords
  * yields 0xffff.
  */
 static uint16_t
 me_in_cksum(uint16_t *p, int nwords)
 {
 	uint32_t acc;
 	int i;
 
 	acc = 0;
 	for (i = 0; i < nwords; i++)
 		acc += p[i];
 	/* Fold the carries out of the upper half back into the low 16 bits. */
 	acc = (acc & 0xffff) + (acc >> 16);
 	acc += acc >> 16;
 	return (~acc);
 }
 
 /*
  * Encap input callback: decapsulate one minimal-encapsulation packet.
  *
  * arg is the softc selected by me_lookup().  The minimal forwarding
  * header that follows the outer IP header is validated (flags, length,
  * checksum), the original destination, protocol and (if present) source
  * are written back into the IP header, the forwarding header is removed
  * and the packet is handed to the IP netisr.  Always returns
  * IPPROTO_DONE; dropped packets are counted as input errors.  The off
  * and proto arguments are not used.
  */
 static int
 me_input(struct mbuf *m, int off, int proto, void *arg)
 {
 	struct me_softc *sc = arg;
 	struct mobhdr *mh;
 	struct ifnet *ifp;
 	struct ip *ip;
 	int hlen;
 
 	NET_EPOCH_ASSERT();
 
 	ifp = ME2IFP(sc);
 	/* checks for short packets */
 	hlen = sizeof(struct mobhdr);
 	/* Too short for the full header: assume the form without mob_src. */
 	if (m->m_pkthdr.len < sizeof(struct ip) + hlen)
 		hlen -= sizeof(struct in_addr);
 	if (m->m_len < sizeof(struct ip) + hlen)
 		m = m_pullup(m, sizeof(struct ip) + hlen);
 	if (m == NULL)
 		goto drop;
 	/* The forwarding header is read at a fixed sizeof(struct ip) offset. */
 	mh = (struct mobhdr *)mtodo(m, sizeof(struct ip));
 	/* check for wrong flags */
 	if (mh->mob_flags & (~MOB_FLAGS_SP)) {
 		m_freem(m);
 		goto drop;
 	}
 	if (mh->mob_flags) {
 	       /* Source-present flag set, but only the short header fits. */
 	       if (hlen != sizeof(struct mobhdr)) {
 			m_freem(m);
 			goto drop;
 	       }
 	} else
 		hlen = sizeof(struct mobhdr) - sizeof(struct in_addr);
 	/* check mobile header checksum */
 	if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) {
 		m_freem(m);
 		goto drop;
 	}
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 	/* Restore the original addressing and protocol in the IP header. */
 	ip = mtod(m, struct ip *);
 	ip->ip_dst = mh->mob_dst;
 	ip->ip_p = mh->mob_proto;
 	ip->ip_sum = 0;
 	ip->ip_len = htons(m->m_pkthdr.len - hlen);
 	if (mh->mob_flags)
 		ip->ip_src = mh->mob_src;
 	/* Slide the IP header over the forwarding header, then trim the front. */
 	memmove(mtodo(m, hlen), ip, sizeof(struct ip));
 	m_adj(m, hlen);
 	m_clrprotoflags(m);
 	m->m_pkthdr.rcvif = ifp;
 	m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
 	M_SETFIB(m, ifp->if_fib);
 	/* hlen is reused as the address-family word prepended for BPF. */
 	hlen = AF_INET;
 	BPF_MTAP2(ifp, &hlen, sizeof(hlen), m);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	if ((ifp->if_flags & IFF_MONITOR) != 0)
 		m_freem(m);
 	else
 		netisr_dispatch(NETISR_IP, m);
 	return (IPPROTO_DONE);
 drop:
 	/* Every jump here has already freed the mbuf (or m_pullup() did). */
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 	return (IPPROTO_DONE);
 }
 
 /*
  * if_output handler: determine the packet's address family, stash it in
  * m_pkthdr.csum_data (me_transmit() reads it back from there) and pass
  * the packet to the interface's if_transmit routine.  Returns whatever
  * if_transmit returns.
  */
 static int
 me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
    struct route *ro)
 {
 	uint32_t af;
 
 	if (dst->sa_family == AF_UNSPEC)
 		/* The real family is carried in the first bytes of sa_data. */
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = RO_GET_FAMILY(ro, dst);
 	m->m_pkthdr.csum_data = af;
 	return (ifp->if_transmit(ifp, m));
 }
 
 #define	MTAG_ME	1414491977
 static int
 me_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	ME_RLOCK_TRACKER;
 	struct mobhdr mh;
 	struct me_softc *sc;
 	struct ip *ip;
 	uint32_t af;
 	int error, hlen, plen;
 
 	ME_RLOCK();
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error != 0)
 		goto drop;
 #endif
 	error = ENETDOWN;
 	sc = ifp->if_softc;
 	if (sc == NULL || !ME_READY(sc) ||
 	    (ifp->if_flags & IFF_MONITOR) != 0 ||
 	    (ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (error = if_tunnel_check_nesting(ifp, m, MTAG_ME,
 		V_max_me_nesting)) != 0) {
 		m_freem(m);
 		goto drop;
 	}
 	af = m->m_pkthdr.csum_data;
 	if (af != AF_INET) {
 		error = EAFNOSUPPORT;
 		m_freem(m);
 		goto drop;
 	}
 	if (m->m_len < sizeof(struct ip))
 		m = m_pullup(m, sizeof(struct ip));
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto drop;
 	}
 	ip = mtod(m, struct ip *);
 	/* Fragmented datagramms shouldn't be encapsulated */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		error = EINVAL;
 		m_freem(m);
 		goto drop;
 	}
 	mh.mob_proto = ip->ip_p;
 	mh.mob_src = ip->ip_src;
 	mh.mob_dst = ip->ip_dst;
 	if (in_hosteq(sc->me_src, ip->ip_src)) {
 		hlen = sizeof(struct mobhdr) - sizeof(struct in_addr);
 		mh.mob_flags = 0;
 	} else {
 		hlen = sizeof(struct mobhdr);
 		mh.mob_flags = MOB_FLAGS_SP;
 	}
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	plen = m->m_pkthdr.len;
 	ip->ip_src = sc->me_src;
 	ip->ip_dst = sc->me_dst;
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	M_SETFIB(m, sc->me_fibnum);
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto drop;
 	}
 	if (m->m_len < sizeof(struct ip) + hlen)
 		m = m_pullup(m, sizeof(struct ip) + hlen);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto drop;
 	}
 	memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip));
 	ip = mtod(m, struct ip *);
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_p = IPPROTO_MOBILE;
 	ip->ip_sum = 0;
 	mh.mob_csum = 0;
 	mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t));
 	bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen);
 	error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
 drop:
 	if (error)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	else {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, plen);
 	}
 	ME_RUNLOCK();
 	return (error);
 }
 
 /*
  * if_qflush handler installed by me_clone_create().  The body is empty:
  * nothing is flushed here.
  */
 static void
 me_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 static const struct srcaddrtab *me_srcaddrtab = NULL;
 static const struct encaptab *ecookie = NULL;
 static const struct encap_config me_encap_cfg = {
 	.proto = IPPROTO_MOBILE,
 	.min_length = sizeof(struct ip) + sizeof(struct mobhdr) -
 	    sizeof(in_addr_t),
 	.exact_match = ENCAP_DRV_LOOKUP,
 	.lookup = me_lookup,
 	.input = me_input
 };
 
 /*
  * Module event handler.
  *
  * MOD_LOAD registers the source-address callback (me_srcaddr) and
  * attaches the encap handler described by me_encap_cfg; MOD_UNLOAD
  * undoes both in reverse order.  Any other event returns EOPNOTSUPP.
  */
 static int
 memodevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		me_srcaddrtab = ip_encap_register_srcaddr(me_srcaddr,
 		    NULL, M_WAITOK);
 		ecookie = ip_encap_attach(&me_encap_cfg, NULL, M_WAITOK);
 		break;
 	case MOD_UNLOAD:
 		ip_encap_detach(ecookie);
 		ip_encap_unregister_srcaddr(me_srcaddrtab);
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t me_mod = {
 	"if_me",
 	memodevent,
 	0
 };
 
 DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_me, 1);
diff --git a/sys/net/if_media.c b/sys/net/if_media.c
index 4100aaab9521..441c7258a473 100644
--- a/sys/net/if_media.c
+++ b/sys/net/if_media.c
@@ -1,508 +1,509 @@
 /*	$NetBSD: if_media.c,v 1.1 1997/03/17 02:55:15 thorpej Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1997
  *	Jonathan Stone and Jason R. Thorpe.  All rights reserved.
  *
  * This software is derived from information provided by Matt Thomas.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Jonathan Stone
  *	and Jason R. Thorpe for the NetBSD Project.
  * 4. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * BSD/OS-compatible network interface media selection.
  *
  * Where it is safe to do so, this code strays slightly from the BSD/OS
  * design.  Software which uses the API (device drivers, basically)
  * shouldn't notice any difference.
  *
  * Many thanks to Matt Thomas for providing the information necessary
  * to implement this interface.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ifmedia.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 
 /*
  * Compile-time options:
  * IFMEDIA_DEBUG:
  *	turn on implementation-level debug printfs.
  * 	Useful for debugging newly-ported  drivers.
  */
 
 static struct ifmedia_entry *ifmedia_match(struct ifmedia *ifm,
     int flags, int mask);
 
 #ifdef IFMEDIA_DEBUG
 #include <net/if_var.h>
+#include <net/if_private.h>
 /* Run-time switch for the debug printfs; exported as debug.ifmedia. */
 int	ifmedia_debug = 0;
 SYSCTL_INT(_debug, OID_AUTO, ifmedia, CTLFLAG_RW, &ifmedia_debug,
 	    0, "if_media debugging msgs");
 /* Pretty-printer for a media word, defined at the end of the file. */
 static	void ifmedia_printword(int);
 #endif
 
 /*
  * Prepare an ifmedia instance for use: record the don't-care mask and the
  * driver's change/status callbacks, and start with an empty media list and
  * no current selection.
  */
 void
 ifmedia_init(struct ifmedia *ifm, int dontcare_mask,
     ifm_change_cb_t change_callback, ifm_stat_cb_t status_callback)
 {
 
 	/* Driver-supplied configuration. */
 	ifm->ifm_change = change_callback;
 	ifm->ifm_status = status_callback;
 	ifm->ifm_mask = dontcare_mask;		/* IF don't-care bits */
 
 	/* Nothing added and nothing selected yet. */
 	LIST_INIT(&ifm->ifm_list);
 	ifm->ifm_media = 0;
 	ifm->ifm_cur = NULL;
 }
 
 /*
  * Unlink and free every media entry on the instance's list, then clear
  * the current selection.
  */
 void
 ifmedia_removeall(struct ifmedia *ifm)
 {
 	struct ifmedia_entry *ent;
 
 	for (;;) {
 		ent = LIST_FIRST(&ifm->ifm_list);
 		if (ent == NULL)
 			break;
 		LIST_REMOVE(ent, ifm_list);
 		free(ent, M_IFADDR);
 	}
 
 	ifm->ifm_cur = NULL;
 }
 
 /*
  * Insert one media configuration (media word, driver data and aux pointer)
  * at the head of the list of media supported by this interface instance.
  * The entry is allocated with M_NOWAIT; the function panics if that
  * allocation fails.
  */
 void
 ifmedia_add(struct ifmedia *ifm, int mword, int data, void *aux)
 {
 	struct ifmedia_entry *ent;
 
 #ifdef IFMEDIA_DEBUG
 	if (ifmedia_debug) {
 		if (ifm == NULL) {
 			printf("ifmedia_add: null ifm\n");
 			return;
 		}
 		printf("Adding entry for (%#010x) ", mword);
 		ifmedia_printword(mword);
 	}
 #endif
 
 	ent = malloc(sizeof(*ent), M_IFADDR, M_NOWAIT);
 	if (ent == NULL)
 		panic("ifmedia_add: can't malloc entry");
 
 	ent->ifm_aux = aux;
 	ent->ifm_data = data;
 	ent->ifm_media = mword;
 
 	LIST_INSERT_HEAD(&ifm->ifm_list, ent, ifm_list);
 }
 
 /*
  * Register an array of `count' media configurations by handing each
  * element, in order, to ifmedia_add().
  */
 void
 ifmedia_list_add(struct ifmedia *ifm, struct ifmedia_entry *lp, int count)
 {
 	int idx = 0;
 
 	while (idx < count) {
 		struct ifmedia_entry *src = &lp[idx];
 
 		ifmedia_add(ifm, src->ifm_media, src->ifm_data,
 		    src->ifm_aux);
 		idx++;
 	}
 }
 
 /*
  * Select the default active media.
  *
  * The caller is device-specific code that is assumed to have programmed
  * the hardware already, so the media-change callback is deliberately not
  * invoked.  Panics when no list entry matches `target'.
  */
 void
 ifmedia_set(struct ifmedia *ifm, int target)
 {
 	struct ifmedia_entry *found;
 
 	found = ifmedia_match(ifm, target, ifm->ifm_mask);
 	if (found == NULL) {
 		printf("ifmedia_set: no match for 0x%x/0x%x\n",
 		    target, ~ifm->ifm_mask);
 		panic("ifmedia_set");
 	}
 
 	ifm->ifm_cur = found;
 
 #ifdef IFMEDIA_DEBUG
 	if (ifmedia_debug) {
 		printf("ifmedia_set: target ");
 		ifmedia_printword(target);
 		printf("ifmedia_set: setting to ");
 		ifmedia_printword(ifm->ifm_cur->ifm_media);
 	}
 #endif
 }
 
 /*
  * Translate a media word for an application that only understands the
  * original encoding: an Ethernet word whose subtype is above IFM_OTHER
  * has its extended-type and subtype bits cleared and is reported as
  * IFM_OTHER.  Every other word is returned unchanged.
  */
 static int
 compat_media(int media)
 {
 
 	if (IFM_TYPE(media) != IFM_ETHER || IFM_SUBTYPE(media) <= IFM_OTHER)
 		return (media);
 
 	return ((media & ~(IFM_ETH_XTYPE|IFM_TMASK)) | IFM_OTHER);
 }
 
 /*
  * Device-independent media ioctl support function.
  *
  * Handles SIOCSIFMEDIA (select a new media word) and SIOCGIFMEDIA /
  * SIOCGIFXMEDIA (report current media, status and the supported list).
  * Returns 0 on success; EINVAL for NULL arguments, a negative ifm_count
  * or an unknown command; ENXIO when the requested media matches no list
  * entry; E2BIG when the caller's non-empty list buffer is too small; or
  * whatever error the driver's change callback or copyout() produced.
  */
 int
 ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, struct ifmedia *ifm,
     u_long cmd)
 {
 	struct ifmedia_entry *match;
 	/* The same request buffer viewed as a media request (get commands). */
 	struct ifmediareq *ifmr = (struct ifmediareq *) ifr;
 	int error = 0;
 
 	if (ifp == NULL || ifr == NULL || ifm == NULL)
 		return (EINVAL);
 
 	switch (cmd) {
 	/*
 	 * Set the current media.
 	 */
 	case  SIOCSIFMEDIA:
 	{
 		struct ifmedia_entry *oldentry;
 		int oldmedia;
 		int newmedia = ifr->ifr_media;
 
 		match = ifmedia_match(ifm, newmedia, ifm->ifm_mask);
 		if (match == NULL) {
 #ifdef IFMEDIA_DEBUG
 			if (ifmedia_debug) {
 				printf(
 		    "ifmedia_ioctl: no media found for %#010x mask %#010x\n", 
 				    newmedia, ifm->ifm_mask);
 			}
 #endif
 			return (ENXIO);
 		}
 
 		/*
 		 * If no change, we're done.
 		 * XXX Automedia may involve software intervention.
 		 *     Keep going in case the connected media changed.
 		 *     Similarly, if best match changed (kernel debugger?).
 		 */
 		if (IFM_SUBTYPE(newmedia) != IFM_AUTO &&
 		    newmedia == ifm->ifm_media && match == ifm->ifm_cur)
 			return (0);
 
 		/*
 		 * We found a match, now make the driver switch to it.
 		 * Make sure to preserve our old media type in case the
 		 * driver can't switch.
 		 */
 #ifdef IFMEDIA_DEBUG
 		if (ifmedia_debug) {
 			printf("ifmedia_ioctl: switching %s to ",
 			    ifp->if_xname);
 			ifmedia_printword(match->ifm_media);
 		}
 #endif
 		/*
 		 * The new selection is installed before the callback runs
 		 * and rolled back if the callback reports an error.
 		 */
 		oldentry = ifm->ifm_cur;
 		oldmedia = ifm->ifm_media;
 		ifm->ifm_cur = match;
 		ifm->ifm_media = newmedia;
 		error = (*ifm->ifm_change)(ifp);
 		if (error) {
 			ifm->ifm_cur = oldentry;
 			ifm->ifm_media = oldmedia;
 		}
 		break;
 	}
 
 	/*
 	 * Get list of available media and current media on interface.
 	 */
 	case  SIOCGIFMEDIA: 
 	case  SIOCGIFXMEDIA: 
 	{
 		struct ifmedia_entry *ep;
 		int i;
 
 		if (ifmr->ifm_count < 0)
 			return (EINVAL);
 
 		/*
 		 * Only the non-extended command passes the current media
 		 * word through compat_media().
 		 */
 		if (cmd == SIOCGIFMEDIA) {
 			ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ?
 			    compat_media(ifm->ifm_cur->ifm_media) : IFM_NONE;
 		} else {
 			ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ?
 			    ifm->ifm_cur->ifm_media : IFM_NONE;
 		}
 		ifmr->ifm_mask = ifm->ifm_mask;
 		ifmr->ifm_status = 0;
 		/* Let the driver fill in the status fields of the request. */
 		(*ifm->ifm_status)(ifp, ifmr);
 
 		/*
 		 * Copy out as many media words as the caller has room for,
 		 * but keep counting the remaining entries.  This allows the
 		 * caller to set ifmr->ifm_count to 0 on the first call to
 		 * know how much space to allocate.
 		 */
 		i = 0;
 		LIST_FOREACH(ep, &ifm->ifm_list, ifm_list) {
 			if (i < ifmr->ifm_count) {
 				error = copyout(&ep->ifm_media,
 				    ifmr->ifm_ulist + i, sizeof(int));
 				if (error != 0)
 					break;
 			}
 			i++;
 		}
 		/* A zero-sized buffer is a size query, not an error. */
 		if (error == 0 && i > ifmr->ifm_count)
 			error = ifmr->ifm_count != 0 ? E2BIG : 0;
 		ifmr->ifm_count = i;
 		break;
 	}
 
 	default:
 		return (EINVAL);
 	}
 
 	return (error);
 }
 
 /*
  * Find the media entry matching a given media word.
  *
  * Bits set in `mask' are ignored in the comparison.  When several entries
  * match, the one latest in the list is returned; with IFMEDIA_DEBUG or
  * DIAGNOSTIC each additional match is reported.  Returns NULL when
  * nothing matches.
  */
 static struct ifmedia_entry *
 ifmedia_match(struct ifmedia *ifm, int target, int mask)
 {
 	struct ifmedia_entry *ent, *found = NULL;
 	const int care = ~mask;
 
 	LIST_FOREACH(ent, &ifm->ifm_list, ifm_list) {
 		if ((ent->ifm_media & care) != (target & care))
 			continue;
 #if defined(IFMEDIA_DEBUG) || defined(DIAGNOSTIC)
 		if (found != NULL) {
 			printf("ifmedia_match: multiple match for "
 			    "%#010x/%#010x\n", target, care);
 		}
 #endif
 		found = ent;
 	}
 
 	return (found);
 }
 
 /*
  * Compute the interface `baudrate' from the media, for the interface
  * metrics (used by routing daemons).
  */
 static const struct ifmedia_baudrate ifmedia_baudrate_descriptions[] =   
     IFM_BAUDRATE_DESCRIPTIONS;
 
 uint64_t
 ifmedia_baudrate(int mword)
 {
 	int i;
 
 	for (i = 0; ifmedia_baudrate_descriptions[i].ifmb_word != 0; i++) {
 		if (IFM_TYPE_MATCH(mword, ifmedia_baudrate_descriptions[i].
 		    ifmb_word))
 			return (ifmedia_baudrate_descriptions[i].ifmb_baudrate);
 	}
 
 	/* Not known. */
 	return (0);
 }
 
 #ifdef IFMEDIA_DEBUG
 /*
  * Description tables used only by ifmedia_printword().  Each is
  * initialised from the corresponding IFM_*_DESCRIPTIONS macro and is
  * walked until an entry with a NULL ifmt_string is reached.
  */
 static const struct ifmedia_description ifm_type_descriptions[] =
     IFM_TYPE_DESCRIPTIONS;
 
 static const struct ifmedia_description ifm_subtype_ethernet_descriptions[] =
     IFM_SUBTYPE_ETHERNET_DESCRIPTIONS;
 
 static const struct ifmedia_description
     ifm_subtype_ethernet_option_descriptions[] =
     IFM_SUBTYPE_ETHERNET_OPTION_DESCRIPTIONS;
 
 static const struct ifmedia_description ifm_subtype_ieee80211_descriptions[] =
     IFM_SUBTYPE_IEEE80211_DESCRIPTIONS;
 
 static const struct ifmedia_description
     ifm_subtype_ieee80211_option_descriptions[] =
     IFM_SUBTYPE_IEEE80211_OPTION_DESCRIPTIONS;
 
 static const struct ifmedia_description
     ifm_subtype_ieee80211_mode_descriptions[] =
     IFM_SUBTYPE_IEEE80211_MODE_DESCRIPTIONS;
 
 static const struct ifmedia_description ifm_subtype_atm_descriptions[] =
     IFM_SUBTYPE_ATM_DESCRIPTIONS;
 
 static const struct ifmedia_description ifm_subtype_atm_option_descriptions[] =
     IFM_SUBTYPE_ATM_OPTION_DESCRIPTIONS;
 
 /* Subtypes and options that are looked up regardless of the media type. */
 static const struct ifmedia_description ifm_subtype_shared_descriptions[] =
     IFM_SUBTYPE_SHARED_DESCRIPTIONS;
 
 static const struct ifmedia_description ifm_shared_option_descriptions[] =
     IFM_SHARED_OPTION_DESCRIPTIONS;
 
 /*
  * Per-media-type set of description tables: subtypes, options and
  * (optionally, may be NULL) modes.
  */
 struct ifmedia_type_to_subtype {
 	const struct ifmedia_description *subtypes;
 	const struct ifmedia_description *options;
 	const struct ifmedia_description *modes;
 };
 
 /* must be in the same order as IFM_TYPE_DESCRIPTIONS */
 static const struct ifmedia_type_to_subtype ifmedia_types_to_subtypes[] = {
 	/* Ethernet: no mode table. */
 	{
 	  &ifm_subtype_ethernet_descriptions[0],
 	  &ifm_subtype_ethernet_option_descriptions[0],
 	  NULL,
 	},
 	/* IEEE 802.11: the only type with a mode table. */
 	{
 	  &ifm_subtype_ieee80211_descriptions[0],
 	  &ifm_subtype_ieee80211_option_descriptions[0],
 	  &ifm_subtype_ieee80211_mode_descriptions[0]
 	},
 	/* ATM: no mode table. */
 	{
 	  &ifm_subtype_atm_descriptions[0],
 	  &ifm_subtype_atm_option_descriptions[0],
 	  NULL,
 	},
 };
 
 /*
  * Print a media word in human-readable form on one line:
  * "<type>[ mode <mode>] <subtype>[ <opt[,opt...]>]".
  * Prints "<unknown type>" or "<unknown subtype>" and stops early when the
  * type or subtype is not found in the description tables.
  */
 static void
 ifmedia_printword(int ifmw)
 {
 	const struct ifmedia_description *desc;
 	const struct ifmedia_type_to_subtype *ttos;
 	int seen_option = 0;
 
 	/*
 	 * Find the top-level interface type.  desc and ttos advance in
 	 * lockstep, which is why the two tables must share their order.
 	 * NOTE(review): ifmedia_types_to_subtypes has three entries; if
 	 * IFM_TYPE_DESCRIPTIONS lists more types, ttos can point past the
 	 * end of its table for the later ones -- confirm against if_media.h.
 	 */
 	for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes;
 	    desc->ifmt_string != NULL; desc++, ttos++)
 		if (IFM_TYPE(ifmw) == desc->ifmt_word)
 			break;
 	if (desc->ifmt_string == NULL) {
 		printf("<unknown type>\n");
 		return;
 	}
 	printf("%s", desc->ifmt_string);
 
 	/* Any mode.  The mode table may be NULL, hence the desc check. */
 	for (desc = ttos->modes; desc && desc->ifmt_string != NULL; desc++)
 		if (IFM_MODE(ifmw) == desc->ifmt_word) {
 			if (desc->ifmt_string != NULL)
 				printf(" mode %s", desc->ifmt_string);
 			break;
 		}
 
 	/*
 	 * Check for the shared subtype descriptions first, then the
 	 * type-specific ones.
 	 */
 	for (desc = ifm_subtype_shared_descriptions;
 	    desc->ifmt_string != NULL; desc++)
 		if (IFM_SUBTYPE(ifmw) == desc->ifmt_word)
 			goto got_subtype;
 
 	for (desc = ttos->subtypes; desc->ifmt_string != NULL; desc++)
 		if (IFM_SUBTYPE(ifmw) == desc->ifmt_word)
 			break;
 	if (desc->ifmt_string == NULL) {
 		printf(" <unknown subtype>\n");
 		return;
 	}
 
  got_subtype:
 	printf(" %s", desc->ifmt_string);
 
 	/*
 	 * Look for shared options.  seen_option counts the options printed
 	 * so far: it opens the "<" once and places the "," separators.
 	 */
 	for (desc = ifm_shared_option_descriptions;
 	    desc->ifmt_string != NULL; desc++) {
 		if (ifmw & desc->ifmt_word) {
 			if (seen_option == 0)
 				printf(" <");
 			printf("%s%s", seen_option++ ? "," : "",
 			    desc->ifmt_string);
 		}
 	}
 
 	/*
 	 * Look for subtype-specific options.
 	 */
 	for (desc = ttos->options; desc->ifmt_string != NULL; desc++) {
 		if (ifmw & desc->ifmt_word) {
 			if (seen_option == 0)
 				printf(" <");
 			printf("%s%s", seen_option++ ? "," : "",
 			    desc->ifmt_string); 
 		}
 	}
 	/* Close the option list only if one was opened. */
 	printf("%s\n", seen_option ? ">" : "");
 }
 #endif /* IFMEDIA_DEBUG */
diff --git a/sys/net/if_mib.c b/sys/net/if_mib.c
index abf983a02c79..603ee72c10a4 100644
--- a/sys/net/if_mib.c
+++ b/sys/net/if_mib.c
@@ -1,144 +1,145 @@
 /*-
  * Copyright 1996 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_mib.h>
 #include <net/vnet.h>
 
 /*
  * A sysctl(3) MIB for generic interface information.  This information
  * is exported in the net.link.generic branch, which has the following
  * structure:
  *
  * net.link.generic	.system			- system-wide control variables
  *						  and statistics (node)
  *			.ifdata.<ifindex>.general
  *						- what's in `struct ifdata'
  *						  plus some other info
  *			.ifdata.<ifindex>.linkspecific
  *						- a link-type-specific data
  *						  structure (as might be used
  *						  by an SNMP agent)
  *
  * Perhaps someday we will make addresses accessible via this interface
  * as well (then there will be four such...).  The reason that the
  * index comes before the last element in the name is because it
  * seems more orthogonal that way, particularly with the possibility
  * of other per-interface data living down here as well (e.g., integrated
  * services stuff).
  */
 
 static int
 sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
 {
 	int *name = (int *)arg1;
 	int error;
 	u_int namelen = arg2;
 	struct ifnet *ifp;
 	struct ifmibdata ifmd;
 	struct epoch_tracker et;
 	size_t dlen;
 	char *dbuf;
 
 	if (namelen != 2)
 		return EINVAL;
 	if (name[0] <= 0)
 		return (ENOENT);
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex_ref(name[0]);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		return (ENOENT);
 
 	switch(name[1]) {
 	default:
 		error = ENOENT;
 		goto out;
 
 	case IFDATA_GENERAL:
 		bzero(&ifmd, sizeof(ifmd));
 		strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name));
 
 		ifmd.ifmd_pcount = ifp->if_pcount;
 		if_data_copy(ifp, &ifmd.ifmd_data);
 
 		ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
 		ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
 		ifmd.ifmd_snd_drops =
 		    ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 
 		error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
 		if (error)
 			goto out;
 		break;
 
 	case IFDATA_LINKSPECIFIC:
 		error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error || !req->newptr)
 			goto out;
 		break;
 
 	case IFDATA_DRIVERNAME:
 		/* 20 is enough for 64bit ints */
 		dlen = strlen(ifp->if_dname) + 20 + 1;
 		if ((dbuf = malloc(dlen, M_TEMP, M_NOWAIT)) == NULL) {
 			error = ENOMEM;
 			goto out;
 		}
 		if (ifp->if_dunit == IF_DUNIT_NONE)
 			strcpy(dbuf, ifp->if_dname);
 		else
 			sprintf(dbuf, "%s%d", ifp->if_dname, ifp->if_dunit);
 
 		error = SYSCTL_OUT(req, dbuf, strlen(dbuf) + 1);
 		if (error == 0 && req->newptr != NULL)
 			error = EPERM;
 		free(dbuf, M_TEMP);
 		goto out;
 	}
 out:
 	if_rele(ifp);
 	return error;
 }
 
 /* Read-only, MP-safe node net.link.generic.ifdata served by sysctl_ifdata(). */
 SYSCTL_DECL(_net_link_generic);
 static SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ifdata,
     "Interface table");
diff --git a/sys/net/if_ovpn.c b/sys/net/if_ovpn.c
index fa69e5277f75..169a17ec6083 100644
--- a/sys/net/if_ovpn.c
+++ b/sys/net/if_ovpn.c
@@ -1,2529 +1,2530 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2021-2022 Rubicon Communications, LLC (Netgate)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf_ring.h>
 #include <sys/epoch.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/nv.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 
 #include <machine/atomic.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_fib.h>
 
 #include <machine/in_cksum.h>
 
 #include <opencrypto/cryptodev.h>
 
 #include "if_ovpn.h"
 
 /*
  * Key material and state for one direction of a key; struct ovpn_kkey
  * holds one of these for encryption and one for decryption.
  */
 struct ovpn_kkey_dir {
 	int			refcount;
 	uint8_t			key[32];	/* key bytes */
 	uint8_t			keylen;		/* bytes of key[] in use */
 	uint8_t			nonce[8];	/* nonce bytes */
 	uint8_t			noncelen;	/* bytes of nonce[] in use */
 	enum ovpn_key_cipher	cipher;
 	crypto_session_t	cryptoid;	/* opencrypto session */
 
 	/* NOTE(review): presumably guards rx_seq/rx_window -- confirm. */
 	struct mtx		replay_mtx;
 	/*
 	 * Last seen gapless sequence number. New rx seq numbers must be
 	 * strictly higher than this.
 	 */
 	uint32_t		rx_seq;
 	/* Seen packets, relative to rx_seq. bit(0) will always be 0. */
 	uint64_t		rx_window;
 };
 
 /*
  * One key slot of a peer: the two directional key states plus the key id
  * and peer id that go with them.
  */
 struct ovpn_kkey {
 	struct ovpn_kkey_dir	*encrypt;
 	struct ovpn_kkey_dir	*decrypt;
 	uint8_t			 keyid;
 	uint32_t		 peerid;
 };
 
 /*
  * Per-peer keepalive settings.
  * NOTE(review): units are not visible here (presumably seconds) -- confirm.
  */
 struct ovpn_keepalive {
 	uint32_t	interval;
 	uint32_t	timeout;
 };
 
 /*
  * Header carried in front of the payload on the wire; its size is
  * subtracted when computing OVPN_MTU_MAX.
  */
 struct ovpn_wire_header {
 	uint32_t	 opcode; /* opcode, key id, peer id */
 	uint32_t	 seq;	 /* sequence number */
 	uint8_t		 auth_tag[16];
 };
 
 /*
  * Per-peer statistics.  The struct doubles as an index map: each field's
  * offset divided by sizeof(uint64_t) selects a slot in the peer's
  * counter_u64_t array (see OVPN_PEER_COUNTER), so every member must be a
  * uint64_t.
  */
 struct ovpn_peer_counters {
 	uint64_t	pkt_in;
 	uint64_t	pkt_out;
 	uint64_t	bytes_in;
 	uint64_t	bytes_out;
 };
 /* Number of counters in struct ovpn_peer_counters. */
 #define OVPN_PEER_COUNTER_SIZE (sizeof(struct ovpn_peer_counters)/sizeof(uint64_t))
 
 /*
  * Event queued on the softc's notification ring; ovpn_notify_del_peer()
  * builds one when a peer is removed.
  */
 struct ovpn_notification {
 	enum ovpn_notif_type	type;
 	uint32_t		peerid;
 
 	/* Delete notification */
 	enum ovpn_del_reason	del_reason;
 	/* Snapshot of the peer's counters taken when it was deleted. */
 	struct ovpn_peer_counters	counters;
 };
 
 struct ovpn_softc;
 
 /*
  * Kernel state for one peer.  Peers live in the softc's red-black tree,
  * ordered by peerid, and are reference counted.
  */
 struct ovpn_kpeer {
 	RB_ENTRY(ovpn_kpeer)	 tree;		/* linkage in sc->peers */
 	int			 refcount;
 	uint32_t		 peerid;	/* tree key */
 
 	struct ovpn_softc	*sc;		/* owning interface */
 	struct sockaddr_storage	 local;		/* local socket address */
 	struct sockaddr_storage	 remote;	/* peer's address */
 
 	struct in_addr		 vpn4;		/* from "vpn_ipv4", if given */
 	struct in6_addr		 vpn6;		/* from "vpn_ipv6", if given */
 
 	struct ovpn_kkey	 keys[2];	/* two key slots */
 	uint32_t		 tx_seq;	/* starts at 1 */
 
 	enum ovpn_del_reason	 del_reason;	/* reported on deletion */
 	struct ovpn_keepalive	 keepalive;
 	uint32_t		*last_active;	/* per-CPU (pcpu_zone_4) */
 	struct callout		 ping_send;	/* tied to sc->lock */
 	struct callout		 ping_rcv;	/* tied to sc->lock */
 
 	/* Indexed via OVPN_PEER_COUNTER(). */
 	counter_u64_t		 counters[OVPN_PEER_COUNTER_SIZE];
 };
 
 /*
  * Interface-wide statistics.  As with struct ovpn_peer_counters, field
  * offsets are used as indices into the softc's counter_u64_t array (see
  * OVPN_COUNTER), so every member must be a uint64_t.
  */
 struct ovpn_counters {
 	uint64_t	lost_ctrl_pkts_in;
 	uint64_t	lost_ctrl_pkts_out;
 	uint64_t	lost_data_pkts_in;
 	uint64_t	lost_data_pkts_out;
 	uint64_t	nomem_data_pkts_in;
 	uint64_t	nomem_data_pkts_out;
 	uint64_t	received_ctrl_pkts;
 	uint64_t	received_data_pkts;
 	uint64_t	sent_ctrl_pkts;
 	uint64_t	sent_data_pkts;
 
 	uint64_t	transport_bytes_sent;
 	uint64_t	transport_bytes_received;
 	uint64_t	tunnel_bytes_sent;
 	uint64_t	tunnel_bytes_received;
 };
 /* Number of counters in struct ovpn_counters. */
 #define OVPN_COUNTER_SIZE (sizeof(struct ovpn_counters)/sizeof(uint64_t))
 
 /* Red-black tree head type for the set of peers of one interface. */
 RB_HEAD(ovpn_kpeers, ovpn_kpeer);
 
 /* Per-interface state. */
 struct ovpn_softc {
 	int			 refcount;
 	struct rmlock		 lock;		/* taken via OVPN_*LOCK() */
 	struct ifnet		*ifp;
 	struct socket		*so;		/* NULL when none is held */
 	int			 peercount;	/* see ovpn_has_peers() */
 	struct ovpn_kpeers	 peers;		/* keyed by peerid */
 
 	/* Pending notification */
 	struct buf_ring		*notifring;
 
 	/* Indexed via OVPN_COUNTER(). */
 	counter_u64_t 		 counters[OVPN_COUNTER_SIZE];
 
 	struct epoch_context	 epoch_ctx;
 };
 
 static struct ovpn_kpeer *ovpn_find_peer(struct ovpn_softc *, uint32_t);
 static bool ovpn_udp_input(struct mbuf *, int, struct inpcb *,
     const struct sockaddr *, void *);
 static int ovpn_transmit_to_peer(struct ifnet *, struct mbuf *,
     struct ovpn_kpeer *, struct rm_priotracker *);
 static int ovpn_encap(struct ovpn_softc *, uint32_t, struct mbuf *);
 static int ovpn_get_af(struct mbuf *);
 static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *);
 static bool ovpn_check_replay(struct ovpn_kkey_dir *, uint32_t);
 static int ovpn_peer_compare(struct ovpn_kpeer *, struct ovpn_kpeer *);
 
 static RB_PROTOTYPE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare);
 static RB_GENERATE(ovpn_kpeers, ovpn_kpeer, tree, ovpn_peer_compare);
 
 /*
  * MTU bounds.  The maximum is the largest IP packet less the IP, UDP and
  * ovpn wire headers.
  */
 #define OVPN_MTU_MIN		576
 #define OVPN_MTU_MAX		(IP_MAXPACKET - sizeof(struct ip) - \
     sizeof(struct udphdr) - sizeof(struct ovpn_wire_header))
 
 /*
  * NOTE(review): presumably the data-packet opcode value and the shift that
  * positions it within the opcode word -- confirm against the users.
  */
 #define OVPN_OP_DATA_V2		0x09
 #define OVPN_OP_SHIFT		3
 
 /* Per-vnet interface cloner. */
 VNET_DEFINE_STATIC(struct if_clone *, ovpn_cloner);
 #define	V_ovpn_cloner	VNET(ovpn_cloner)
 
 /*
  * Wrappers around the softc's rmlock.  A function that takes the read lock
  * must declare OVPN_RLOCK_TRACKER first: OVPN_RLOCK/OVPN_RUNLOCK use the
  * tracker pointer it defines.
  */
 #define OVPN_RLOCK_TRACKER	struct rm_priotracker _ovpn_lock_tracker; \
     struct rm_priotracker *_ovpn_lock_trackerp = &_ovpn_lock_tracker
 #define OVPN_RLOCK(sc)		rm_rlock(&(sc)->lock, _ovpn_lock_trackerp)
 #define OVPN_RUNLOCK(sc)	rm_runlock(&(sc)->lock, _ovpn_lock_trackerp)
 #define OVPN_WLOCK(sc)		rm_wlock(&(sc)->lock)
 #define OVPN_WUNLOCK(sc)	rm_wunlock(&(sc)->lock)
 /* Lock-state assertions: held at all, read-held, write-held, not held. */
 #define OVPN_ASSERT(sc)		rm_assert(&(sc)->lock, RA_LOCKED)
 #define OVPN_RASSERT(sc)	rm_assert(&(sc)->lock, RA_RLOCKED)
 #define OVPN_WASSERT(sc)	rm_assert(&(sc)->lock, RA_WLOCKED)
 #define OVPN_UNLOCK_ASSERT(sc)	rm_assert(&(sc)->lock, RA_UNLOCKED)
 
 /*
  * Select the counter_u64_t that corresponds to a named field of
  * struct ovpn_counters / struct ovpn_peer_counters, using the field's
  * offset in units of uint64_t as the array index.
  */
 #define OVPN_COUNTER(sc, name) \
 	((sc)->counters[offsetof(struct ovpn_counters, name)/sizeof(uint64_t)])
 #define OVPN_PEER_COUNTER(peer, name) \
 	((peer)->counters[offsetof(struct ovpn_peer_counters, name) / \
 	 sizeof(uint64_t)])
 
 /* Add `val' to the named interface-wide or per-peer counter. */
 #define OVPN_COUNTER_ADD(sc, name, val)	\
 	counter_u64_add(OVPN_COUNTER(sc, name), val)
 #define OVPN_PEER_COUNTER_ADD(p, name, val)	\
 	counter_u64_add(OVPN_PEER_COUNTER(p, name), val)
 
 /* Cast a generic socket address pointer to its IPv4 / IPv6 form. */
 #define TO_IN(x)		((struct sockaddr_in *)(x))
 #define TO_IN6(x)		((struct sockaddr_in6 *)(x))
 
 /* Static trace probes for the transmit path. */
 SDT_PROVIDER_DEFINE(if_ovpn);
 SDT_PROBE_DEFINE1(if_ovpn, tx, transmit, start, "struct mbuf *");
 SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip4, "struct in_addr *", "struct ovpn_kpeer *");
 SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip6, "struct in6_addr *", "struct ovpn_kpeer *");
 
 /* Interface name prefix and interface group name. */
 static const char ovpnname[] = "ovpn";
 static const char ovpngroupname[] = "openvpn";
 
 /* malloc(9) type for all allocations made by this driver. */
 static MALLOC_DEFINE(M_OVPN, ovpnname, "OpenVPN DCO Interface");
 
 /* Tunables under net.link.openvpn; each one is per-vnet and writable. */
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_OTHER, openvpn, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "OpenVPN DCO Interface");
 /* Off (0) by default. */
 VNET_DEFINE_STATIC(int, replay_protection) = 0;
 #define	V_replay_protection	VNET(replay_protection)
 SYSCTL_INT(_net_link_openvpn, OID_AUTO, replay_protection, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(replay_protection), 0, "Validate sequence numbers");
 
 VNET_DEFINE_STATIC(int, async_crypto);
 #define	V_async_crypto		VNET(async_crypto)
 SYSCTL_INT(_net_link_openvpn, OID_AUTO, async_crypto,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(async_crypto), 0,
 	"Use asynchronous mode to parallelize crypto jobs.");
 
 /* Note: the variable is async_netisr_queue, the sysctl leaf netisr_queue. */
 VNET_DEFINE_STATIC(int, async_netisr_queue);
 #define	V_async_netisr_queue		VNET(async_netisr_queue)
 SYSCTL_INT(_net_link_openvpn, OID_AUTO, netisr_queue,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(async_netisr_queue), 0,
 	"Use netisr_queue() rather than netisr_dispatch().");
 
 /*
  * Ordering function for the peer red-black tree, keyed on peerid.
  *
  * Returns a negative value, zero or a positive value when a's id is
  * lower than, equal to or higher than b's.  The ids are compared rather
  * than subtracted: the difference of two uint32_t values wraps and, once
  * converted to int, has the wrong sign when the ids are more than INT_MAX
  * apart, which would make the ordering inconsistent.
  */
 static int
 ovpn_peer_compare(struct ovpn_kpeer *a, struct ovpn_kpeer *b)
 {
 	if (a->peerid < b->peerid)
 		return (-1);
 	if (a->peerid > b->peerid)
 		return (1);
 	return (0);
 }
 
 /*
  * Look up the peer with the given id in the softc's peer tree.  The softc
  * lock must be held.  Returns the result of the tree lookup.
  */
 static struct ovpn_kpeer *
 ovpn_find_peer(struct ovpn_softc *sc, uint32_t peerid)
 {
 	struct ovpn_kpeer key;
 	struct ovpn_kpeer *found;
 
 	OVPN_ASSERT(sc);
 
 	/* Only the id is filled in; it is the sole field the comparator reads. */
 	key.peerid = peerid;
 	found = RB_FIND(ovpn_kpeers, &sc->peers, &key);
 
 	return (found);
 }
 
 /*
  * Return the root of the peer tree, i.e. the peer itself when the tree
  * holds exactly one.  The softc lock must be held.
  */
 static struct ovpn_kpeer *
 ovpn_find_only_peer(struct ovpn_softc *sc)
 {
 	struct ovpn_kpeer *root;
 
 	OVPN_ASSERT(sc);
 
 	root = RB_ROOT(&sc->peers);
 
 	return (root);
 }
 
 /*
  * Return the port field of an IPv4 or IPv6 socket address exactly as
  * stored (no byte-order conversion is applied).  Panics on any other
  * address family.
  */
 static uint16_t
 ovpn_get_port(struct sockaddr_storage *s)
 {
 	if (s->ss_family == AF_INET)
 		return (((struct sockaddr_in *)s)->sin_port);
 
 	if (s->ss_family == AF_INET6)
 		return (((struct sockaddr_in6 *)s)->sin6_port);
 
 	panic("Unsupported address family %d", s->ss_family);
 }
 
 /*
  * Fill in a socket address from an nvlist carrying "af" (number),
  * "address" (binary) and "port" (number).
  *
  * Only the family, address and port fields of *sa are written.  Returns 0
  * on success, or EINVAL when a key is missing, the family is not one
  * compiled in, or the address blob has the wrong length for the family
  * (in which case the family field has already been stored).
  */
 static int
 ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
 {
 	int family;
 
 	if (! nvlist_exists_number(nvl, "af") ||
 	    ! nvlist_exists_binary(nvl, "address") ||
 	    ! nvlist_exists_number(nvl, "port"))
 		return (EINVAL);
 
 	family = nvlist_get_number(nvl, "af");
 
 #ifdef INET
 	if (family == AF_INET) {
 		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
 		size_t alen;
 		const void *abytes = nvlist_get_binary(nvl, "address", &alen);
 
 		sin->sin_family = family;
 		if (alen != sizeof(sin->sin_addr))
 			return (EINVAL);
 
 		memcpy(&sin->sin_addr, abytes, sizeof(sin->sin_addr));
 		sin->sin_port = nvlist_get_number(nvl, "port");
 		return (0);
 	}
 #endif
 #ifdef INET6
 	if (family == AF_INET6) {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
 		size_t alen;
 		const void *abytes = nvlist_get_binary(nvl, "address", &alen);
 
 		sin6->sin6_family = family;
 		if (alen != sizeof(sin6->sin6_addr))
 			return (EINVAL);
 
 		memcpy(&sin6->sin6_addr, abytes, sizeof(sin6->sin6_addr));
 		sin6->sin6_port = nvlist_get_number(nvl, "port");
 		return (0);
 	}
 #endif
 
 	return (EINVAL);
 }
 
 /*
  * Report whether the interface currently has any peers, based on
  * sc->peercount.  The softc lock must be held.
  */
 static bool
 ovpn_has_peers(struct ovpn_softc *sc)
 {
 	OVPN_ASSERT(sc);
 
 	return (sc->peercount > 0);
 }
 
 /*
  * Drop one reference on the interface's socket on behalf of a peer.
  *
  * Requires the softc write lock.  Does nothing when no socket is set.
  * When no peers remain, the UDP tunneling hook is removed before the
  * reference is dropped and sc->so is cleared afterwards.  The `peer'
  * argument is not used by the body.
  */
 static void
 ovpn_rele_so(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
 {
 	bool has_peers;
 
 	OVPN_WASSERT(sc);
 
 	if (sc->so == NULL)
 		return;
 
 	/* Sampled once so both decisions below agree. */
 	has_peers = ovpn_has_peers(sc);
 
 	/* Only remove the tunnel function if we're releasing the socket for
 	 * the last peer. */
 	if (! has_peers)
 		(void)udp_set_kernel_tunneling(sc->so, NULL, NULL, NULL);
 
 	sorele(sc->so);
 
 	/* Forget the pointer only after the reference has been dropped. */
 	if (! has_peers)
 		sc->so = NULL;
 }
 
 /*
  * Queue a "peer deleted" notification carrying the peer's id, deletion
  * reason and a snapshot of its counters, then wake up any socket waiter.
  *
  * Requires the softc write lock.  Best effort: the notification is
  * silently dropped when memory cannot be allocated or the ring rejects it.
  */
 static void
 ovpn_notify_del_peer(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
 {
 	struct ovpn_notification *notif;
 
 	OVPN_WASSERT(sc);
 
 	notif = malloc(sizeof(*notif), M_OVPN, M_NOWAIT);
 	if (notif == NULL)
 		return;
 
 	notif->type = OVPN_NOTIF_DEL_PEER;
 	notif->peerid = peer->peerid;
 	notif->del_reason = peer->del_reason;
 
 	notif->counters.pkt_in =
 	    counter_u64_fetch(OVPN_PEER_COUNTER(peer, pkt_in));
 	notif->counters.pkt_out =
 	    counter_u64_fetch(OVPN_PEER_COUNTER(peer, pkt_out));
 	notif->counters.bytes_in =
 	    counter_u64_fetch(OVPN_PEER_COUNTER(peer, bytes_in));
 	notif->counters.bytes_out =
 	    counter_u64_fetch(OVPN_PEER_COUNTER(peer, bytes_out));
 
 	if (buf_ring_enqueue(sc->notifring, notif) != 0) {
 		/* Not queued, so it is still ours to free. */
 		free(notif, M_OVPN);
 		return;
 	}
 
 	if (sc->so == NULL)
 		return;
 
 	/* Wake up userspace */
 	sc->so->so_error = EAGAIN;
 	sorwakeup(sc->so);
 	sowwakeup(sc->so);
 }
 
 /*
  * Drop one reference on a peer and destroy it when none remain.
  *
  * `locked' tells whether the caller already holds the softc lock; when it
  * does not, the write lock is taken here around the teardown and released
  * before returning.  Teardown queues a deletion notification, frees the
  * keys, drops the peer's socket reference, stops the ping callouts and
  * frees the per-peer counters, the per-CPU last_active storage and the
  * peer itself.
  *
  * NOTE(review): the decrement and the following load are two separate
  * atomic operations, so two concurrent releasers could both observe zero;
  * confirm callers cannot race here.
  */
 static void
 ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked)
 {
 	struct ovpn_softc *sc;
 
 	CURVNET_ASSERT_SET();
 
 	atomic_add_int(&peer->refcount, -1);
 
 	if (atomic_load_int(&peer->refcount) > 0)
 		return;
 
 	sc = peer->sc;
 
 	if (! locked) {
 		OVPN_WLOCK(sc);
 
 		/* Might have changed before we acquired the lock. */
 		if (atomic_load_int(&peer->refcount) > 0) {
 			OVPN_WUNLOCK(sc);
 			return;
 		}
 	}
 
 	OVPN_ASSERT(sc);
 
 	/* The peer should have been removed from the list already. */
 	MPASS(ovpn_find_peer(sc, peer->peerid) == NULL);
 
 	/* Reads the peer's counters, so it must precede their release. */
 	ovpn_notify_del_peer(sc, peer);
 
 	for (int i = 0; i < 2; i++) {
 		ovpn_free_kkey_dir(peer->keys[i].encrypt);
 		ovpn_free_kkey_dir(peer->keys[i].decrypt);
 	}
 
 	ovpn_rele_so(sc, peer);
 
 	callout_stop(&peer->ping_send);
 	callout_stop(&peer->ping_rcv);
 	/*
 	 * The counters are allocated with COUNTER_ARRAY_ALLOC() when the
 	 * peer is created; release them here or they leak with the peer.
 	 */
 	COUNTER_ARRAY_FREE(peer->counters, OVPN_PEER_COUNTER_SIZE);
 	uma_zfree_pcpu(pcpu_zone_4, peer->last_active);
 	free(peer, M_OVPN);
 
 	if (! locked)
 		OVPN_WUNLOCK(sc);
 }
 
 static int
 ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
 {
 #ifdef INET6
 	struct epoch_tracker et;
 #endif
 	struct sockaddr_storage remote;
 	struct ovpn_kpeer *peer = NULL;
 	struct file *fp = NULL;
 	struct sockaddr *name = NULL;
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct thread *td = curthread;
 	struct socket *so = NULL;
 	int fd;
 	uint32_t peerid;
 	int ret = 0;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "fd"))
 		return (EINVAL);
 
 	if (! nvlist_exists_nvlist(nvl, "remote"))
 		return (EINVAL);
 
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	ret = ovpn_nvlist_to_sockaddr(nvlist_get_nvlist(nvl, "remote"),
 	    &remote);
 	if (ret != 0)
 		return (ret);
 
 	fd = nvlist_get_number(nvl, "fd");
 
 	/* Look up the userspace process and use the fd to find the socket. */
 	ret = getsock(td, fd, &cap_connect_rights, &fp);
 	if (ret != 0)
 		return (ret);
 
 	so = fp->f_data;
 
 	peer = malloc(sizeof(*peer), M_OVPN, M_WAITOK | M_ZERO);
 	peer->peerid = peerid;
 	peer->sc = sc;
 	peer->tx_seq = 1;
 	peer->refcount = 1;
 	peer->last_active = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO);
 	COUNTER_ARRAY_ALLOC(peer->counters, OVPN_PEER_COUNTER_SIZE, M_WAITOK);
 
 	if (nvlist_exists_binary(nvl, "vpn_ipv4")) {
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "vpn_ipv4", &len);
 		if (len != sizeof(peer->vpn4)) {
 			ret = EINVAL;
 			goto error;
 		}
 		memcpy(&peer->vpn4, addr, len);
 	}
 
 	if (nvlist_exists_binary(nvl, "vpn_ipv6")) {
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "vpn_ipv6", &len);
 		if (len != sizeof(peer->vpn6)) {
 			ret = EINVAL;
 			goto error;
 		}
 		memcpy(&peer->vpn6, addr, len);
 	}
 
 	callout_init_rm(&peer->ping_send, &sc->lock, CALLOUT_SHAREDLOCK);
 	callout_init_rm(&peer->ping_rcv, &sc->lock, 0);
 
 	ret = so->so_proto->pr_sockaddr(so, &name);
 	if (ret)
 		goto error;
 
 	if (ovpn_get_port((struct sockaddr_storage *)name) == 0) {
 		ret = EINVAL;
 		goto error;
 	}
 	if (name->sa_family != remote.ss_family) {
 		ret = EINVAL;
 		goto error;
 	}
 
 	memcpy(&peer->local, name, name->sa_len);
 	memcpy(&peer->remote, &remote, sizeof(remote));
 	free(name, M_SONAME);
 	name = NULL;
 
 	if (peer->local.ss_family == AF_INET6 &&
 	    IN6_IS_ADDR_V4MAPPED(&TO_IN6(&peer->remote)->sin6_addr)) {
 		/* V4 mapped address, so treat this as v4, not v6. */
 		in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->local);
 		in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->remote);
 	}
 
 #ifdef INET6
 	if (peer->local.ss_family == AF_INET6 &&
 	    IN6_IS_ADDR_UNSPECIFIED(&TO_IN6(&peer->local)->sin6_addr)) {
 		NET_EPOCH_ENTER(et);
 		ret = in6_selectsrc_addr(curthread->td_proc->p_fibnum,
 		    &TO_IN6(&peer->remote)->sin6_addr,
 		    0, NULL, &TO_IN6(&peer->local)->sin6_addr, NULL);
 		NET_EPOCH_EXIT(et);
 		if (ret != 0) {
 			goto error;
 		}
 	}
 #endif
 	OVPN_WLOCK(sc);
 
 	/* Disallow peer id re-use. */
 	if (ovpn_find_peer(sc, peerid) != NULL) {
 		ret = EEXIST;
 		goto error_locked;
 	}
 
 	/* Make sure this is really a UDP socket. */
 	if (so->so_type != SOCK_DGRAM || so->so_proto->pr_type != SOCK_DGRAM) {
 		ret = EPROTOTYPE;
 		goto error_locked;
 	}
 
 	/* Must be the same socket as for other peers on this interface. */
 	if (sc->so != NULL && so != sc->so)
 		goto error_locked;
 
 	if (sc->so == NULL)
 		sc->so = so;
 
 	/* Insert the peer into the list. */
 	RB_INSERT(ovpn_kpeers, &sc->peers, peer);
 	sc->peercount++;
 	soref(sc->so);
 
 	ret = udp_set_kernel_tunneling(sc->so, ovpn_udp_input, NULL, sc);
 	if (ret == EBUSY) {
 		/* Fine, another peer already set the input function. */
 		ret = 0;
 	}
 	if (ret != 0) {
 		RB_REMOVE(ovpn_kpeers, &sc->peers, peer);
 		sc->peercount--;
 		goto error_locked;
 	}
 
 	OVPN_WUNLOCK(sc);
 
 	goto done;
 
 error_locked:
 	OVPN_WUNLOCK(sc);
 error:
 	free(name, M_SONAME);
 	COUNTER_ARRAY_FREE(peer->counters, OVPN_PEER_COUNTER_SIZE);
 	uma_zfree_pcpu(pcpu_zone_4, peer->last_active);
 	free(peer, M_OVPN);
 done:
 	if (fp != NULL)
 		fdrop(fp, td);
 
 	return (ret);
 }
 
 static int
 _ovpn_del_peer(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
 {
 	struct ovpn_kpeer *tmp __diagused;
 
 	OVPN_WASSERT(sc);
 	CURVNET_ASSERT_SET();
 
 	MPASS(RB_FIND(ovpn_kpeers, &sc->peers, peer) == peer);
 
 	tmp = RB_REMOVE(ovpn_kpeers, &sc->peers, peer);
 	MPASS(tmp != NULL);
 
 	sc->peercount--;
 
 	ovpn_peer_release_ref(peer, true);
 
 	return (0);
 }
 
 /*
  * Handle a peer-deletion request.  The nvlist must carry a numeric "peerid".
  * Called with the softc write lock held.
  *
  * Returns EINVAL for a malformed request, ENOENT when no such peer exists,
  * and otherwise the result of _ovpn_del_peer().
  */
 static int
 ovpn_del_peer(struct ifnet *ifp, nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kpeer *victim;
 	uint32_t peerid;
 
 	OVPN_WASSERT(sc);
 
 	if (nvl == NULL || ! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	victim = ovpn_find_peer(sc, peerid);
 	if (victim == NULL)
 		return (ENOENT);
 
 	/* Record why the peer goes away before it is removed. */
 	victim->del_reason = OVPN_DEL_REASON_REQUESTED;
 
 	return (_ovpn_del_peer(sc, victim));
 }
 
 static int
 ovpn_create_kkey_dir(struct ovpn_kkey_dir **kdirp,
     const nvlist_t *nvl)
 {
 	struct crypto_session_params csp;
 	struct ovpn_kkey_dir *kdir;
 	const char *ciphername;
 	enum ovpn_key_cipher cipher;
 	const void *key, *iv;
 	size_t keylen = 0, ivlen = 0;
 	int error;
 
 	if (! nvlist_exists_string(nvl, "cipher"))
 		return (EINVAL);
 	ciphername = nvlist_get_string(nvl, "cipher");
 
 	if (strcmp(ciphername, "none") == 0)
 		cipher = OVPN_CIPHER_ALG_NONE;
 	else if (strcmp(ciphername, "AES-256-GCM") == 0 ||
 	    strcmp(ciphername, "AES-192-GCM") == 0 ||
 	    strcmp(ciphername, "AES-128-GCM") == 0)
 		cipher = OVPN_CIPHER_ALG_AES_GCM;
 	else if (strcmp(ciphername, "CHACHA20-POLY1305") == 0)
 		cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305;
 	else
 		return (EINVAL);
 
 	if (cipher != OVPN_CIPHER_ALG_NONE) {
 		if (! nvlist_exists_binary(nvl, "key"))
 			return (EINVAL);
 		key = nvlist_get_binary(nvl, "key", &keylen);
 		if (keylen > sizeof(kdir->key))
 			return (E2BIG);
 
 		if (! nvlist_exists_binary(nvl, "iv"))
 			return (EINVAL);
 		iv = nvlist_get_binary(nvl, "iv", &ivlen);
 		if (ivlen != 8)
 			return (E2BIG);
 	}
 
 	kdir = malloc(sizeof(struct ovpn_kkey_dir), M_OVPN,
 	    M_WAITOK | M_ZERO);
 
 	kdir->cipher = cipher;
 	kdir->keylen = keylen;
 	memcpy(kdir->key, key, keylen);
 	kdir->noncelen = ivlen;
 	memcpy(kdir->nonce, iv, ivlen);
 
 	if (kdir->cipher != OVPN_CIPHER_ALG_NONE) {
 		/* Crypto init */
 		bzero(&csp, sizeof(csp));
 		csp.csp_mode = CSP_MODE_AEAD;
 
 		if (kdir->cipher == OVPN_CIPHER_ALG_CHACHA20_POLY1305)
 			csp.csp_cipher_alg = CRYPTO_CHACHA20_POLY1305;
 		else
 			csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
 
 		csp.csp_flags |= CSP_F_SEPARATE_AAD;
 
 		csp.csp_cipher_klen = kdir->keylen;
 		csp.csp_cipher_key = kdir->key;
 		csp.csp_ivlen = 96 / 8;
 
 		error = crypto_newsession(&kdir->cryptoid, &csp,
 		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
 		if (error) {
 			free(kdir, M_OVPN);
 			return (error);
 		}
 	}
 
 	mtx_init(&kdir->replay_mtx, "if_ovpn rx replay", NULL, MTX_DEF);
 	*kdirp = kdir;
 
 	return (0);
 }
 
 /*
  * Destroy a key direction object: its replay mutex, its crypto session and
  * the object itself.  A NULL argument is accepted and ignored.
  */
 static void
 ovpn_free_kkey_dir(struct ovpn_kkey_dir *kdir)
 {
 	if (kdir != NULL) {
 		mtx_destroy(&kdir->replay_mtx);
 		crypto_freesession(kdir->cryptoid);
 		free(kdir, M_OVPN);
 	}
 }
 
 static int
 ovpn_set_key(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kkey_dir *enc, *dec;
 	struct ovpn_kpeer *peer;
 	int slot, keyid, peerid;
 	int error;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "slot"))
 		return (EINVAL);
 	slot = nvlist_get_number(nvl, "slot");
 
 	if (! nvlist_exists_number(nvl, "keyid"))
 		return (EINVAL);
 	keyid = nvlist_get_number(nvl, "keyid");
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	if (slot != OVPN_KEY_SLOT_PRIMARY &&
 	    slot != OVPN_KEY_SLOT_SECONDARY)
 		return (EINVAL);
 
 	if (! nvlist_exists_nvlist(nvl, "encrypt") ||
 	    ! nvlist_exists_nvlist(nvl, "decrypt"))
 		return (EINVAL);
 
 	error = ovpn_create_kkey_dir(&enc, nvlist_get_nvlist(nvl, "encrypt"));
 	if (error)
 		return (error);
 
 	error = ovpn_create_kkey_dir(&dec, nvlist_get_nvlist(nvl, "decrypt"));
 	if (error) {
 		ovpn_free_kkey_dir(enc);
 		return (error);
 	}
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL) {
 		ovpn_free_kkey_dir(dec);
 		ovpn_free_kkey_dir(enc);
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	ovpn_free_kkey_dir(peer->keys[slot].encrypt);
 	ovpn_free_kkey_dir(peer->keys[slot].decrypt);
 
 	peer->keys[slot].encrypt = enc;
 	peer->keys[slot].decrypt = dec;
 
 	peer->keys[slot].keyid = keyid;
 	peer->keys[slot].peerid = peerid;
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Check that the given key slot of a peer has both directions populated.
  * Returns 0 if so, ENOLINK otherwise.  The softc lock must be held.
  */
 static int
 ovpn_check_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer, enum ovpn_key_slot slot)
 {
 	OVPN_ASSERT(sc);
 
 	if (peer->keys[slot].encrypt == NULL ||
 	    peer->keys[slot].decrypt == NULL)
 		return (ENOLINK);
 
 	return (0);
 }
 
 /*
  * Bring the interface up: mark it running and up, and report the link as
  * up, all under the softc write lock.  Always returns 0.
  */
 static int
 ovpn_start(struct ifnet *ifp)
 {
 	struct ovpn_softc *sc;
 
 	sc = ifp->if_softc;
 
 	OVPN_WLOCK(sc);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_flags |= IFF_UP;
 	if_link_state_change(ifp, LINK_STATE_UP);
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Exchange the two key slots of the peer named by the numeric "peerid"
  * entry of the request.
  *
  * Returns 0, EINVAL for a malformed request, ENOENT for an unknown peer, or
  * the error from ovpn_check_key() when the secondary slot is not usable.
  */
 static int
 ovpn_swap_keys(struct ifnet *ifp, nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kpeer *peer;
 	struct ovpn_kkey saved;
 	int error;
 
 	if (nvl == NULL || ! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 
 	/* Swapping only makes sense if there is a second key to swap to. */
 	error = ovpn_check_key(sc, peer, OVPN_KEY_SLOT_SECONDARY);
 	if (error)
 		goto out;
 
 	saved = peer->keys[0];
 	peer->keys[0] = peer->keys[1];
 	peer->keys[1] = saved;
 
 out:
 	OVPN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 ovpn_del_key(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	enum ovpn_key_slot slot;
 	struct ovpn_kpeer *peer;
 	struct ovpn_softc *sc = ifp->if_softc;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "slot"))
 		return (EINVAL);
 	slot = nvlist_get_number(nvl, "slot");
 
 	if (slot != OVPN_KEY_SLOT_PRIMARY &&
 	    slot != OVPN_KEY_SLOT_SECONDARY)
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	ovpn_free_kkey_dir(peer->keys[slot].encrypt);
 	ovpn_free_kkey_dir(peer->keys[slot].decrypt);
 
 	peer->keys[slot].encrypt = NULL;
 	peer->keys[slot].decrypt = NULL;
 
 	peer->keys[slot].keyid = 0;
 	peer->keys[slot].peerid = 0;
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Callout handler: send a keepalive message to a peer ('arg') and re-arm
  * itself for the next interval.  Runs with the softc read lock held.
  */
 static void
 ovpn_send_ping(void *arg)
 {
 	/* Fixed 16-byte payload used as the keepalive message. */
 	static const uint8_t keepalive_msg[] = {
 		0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb,
 		0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48
 	};
 
 	struct ovpn_kpeer *peer = arg;
 	struct ovpn_softc *sc = peer->sc;
 	struct epoch_tracker et;
 	struct mbuf *pkt;
 
 	OVPN_RASSERT(sc);
 
 	/* Re-arm first, so a failure below does not stop the keepalives. */
 	callout_reset(&peer->ping_send, peer->keepalive.interval * hz,
 	    ovpn_send_ping, peer);
 
 	pkt = m_get2(sizeof(keepalive_msg), M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (pkt == NULL)
 		return;
 
 	m_copyback(pkt, 0, sizeof(keepalive_msg), keepalive_msg);
 	pkt->m_pkthdr.len = sizeof(keepalive_msg);
 	pkt->m_len = pkt->m_pkthdr.len;
 
 	CURVNET_SET(sc->ifp->if_vnet);
 	NET_EPOCH_ENTER(et);
 	/* Best effort: the result is deliberately ignored. */
 	(void)ovpn_transmit_to_peer(sc->ifp, pkt, peer, NULL);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Callout handler for the receive timeout of a peer ('arg').
  *
  * Looks at the most recent per-CPU activity timestamp.  If the peer was
  * active within its keepalive timeout the callout is re-armed for the
  * remaining time; otherwise the peer is deleted with reason TIMEOUT.
  * Runs with the softc write lock held.
  */
 static void
 ovpn_timeout(void *arg)
 {
 	struct ovpn_kpeer *peer = arg;
 	struct ovpn_softc *sc = peer->sc;
 	uint32_t newest, seen;
 	int ret __diagused;
 	int cpu;
 
 	OVPN_WASSERT(sc);
 
 	/* Find the latest timestamp recorded on any CPU. */
 	newest = 0;
 	CPU_FOREACH(cpu) {
 		seen = *zpcpu_get_cpu(peer->last_active, cpu);
 		if (seen > newest)
 			newest = seen;
 	}
 
 	if (newest + peer->keepalive.timeout > time_uptime) {
 		/* Still alive: check again once the remaining time is up. */
 		callout_reset(&peer->ping_rcv,
 		    (peer->keepalive.timeout - (time_uptime - newest)) * hz,
 		    ovpn_timeout, peer);
 		return;
 	}
 
 	CURVNET_SET(sc->ifp->if_vnet);
 	peer->del_reason = OVPN_DEL_REASON_TIMEOUT;
 	ret = _ovpn_del_peer(sc, peer);
 	MPASS(ret == 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Configure the keepalive parameters of a peer.  The request must contain
  * numeric "interval", "timeout" and "peerid" entries.  A positive interval
  * (re)starts the ping sender, a positive timeout (re)starts the receive
  * timeout.
  *
  * Returns 0, EINVAL for a malformed request or ENOENT for an unknown peer.
  */
 static int
 ovpn_set_peer(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kpeer *peer;
 
 	if (nvl == NULL ||
 	    ! nvlist_exists_number(nvl, "interval") ||
 	    ! nvlist_exists_number(nvl, "timeout") ||
 	    ! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	peer->keepalive.interval = nvlist_get_number(nvl, "interval");
 	peer->keepalive.timeout = nvlist_get_number(nvl, "timeout");
 
 	if (peer->keepalive.interval > 0) {
 		callout_reset(&peer->ping_send, peer->keepalive.interval * hz,
 		    ovpn_send_ping, peer);
 	}
 	if (peer->keepalive.timeout > 0) {
 		callout_reset(&peer->ping_rcv, peer->keepalive.timeout * hz,
 		    ovpn_timeout, peer);
 	}
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_set_ifmode(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	int ifmode;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "ifmode") )
 		return (EINVAL);
 
 	ifmode = nvlist_get_number(nvl, "ifmode");
 
 	OVPN_WLOCK(sc);
 
 	/* deny this if UP */
 	if (ifp->if_flags & IFF_UP) {
 		OVPN_WUNLOCK(sc);
 		return (EBUSY);
 	}
 
 	switch (ifmode & ~IFF_MULTICAST) {
 	case IFF_POINTOPOINT:
 	case IFF_BROADCAST:
 		ifp->if_flags &=
 		    ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
 		ifp->if_flags |= ifmode;
 		break;
 	default:
 		OVPN_WUNLOCK(sc);
 		return (EINVAL);
 	}
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_ioctl_set(struct ifnet *ifp, struct ifdrv *ifd)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	uint8_t *buf = NULL;
 	nvlist_t *nvl = NULL;
 	int ret;
 
 	if (ifd->ifd_len != 0) {
 		if (ifd->ifd_len > OVPN_MAX_REQUEST_SIZE)
 			return (E2BIG);
 
 		buf = malloc(ifd->ifd_len, M_OVPN, M_WAITOK);
 
 		ret = copyin(ifd->ifd_data, buf, ifd->ifd_len);
 		if (ret != 0) {
 			free(buf, M_OVPN);
 			return (ret);
 		}
 
 		nvl = nvlist_unpack(buf, ifd->ifd_len, 0);
 		free(buf, M_OVPN);
 		if (nvl == NULL) {
 			return (EINVAL);
 		}
 	}
 
 	switch (ifd->ifd_cmd) {
 	case OVPN_NEW_PEER:
 		ret = ovpn_new_peer(ifp, nvl);
 		break;
 	case OVPN_DEL_PEER:
 		OVPN_WLOCK(sc);
 		ret = ovpn_del_peer(ifp, nvl);
 		OVPN_WUNLOCK(sc);
 		break;
 	case OVPN_NEW_KEY:
 		ret = ovpn_set_key(ifp, nvl);
 		break;
 	case OVPN_START_VPN:
 		ret = ovpn_start(ifp);
 		break;
 	case OVPN_SWAP_KEYS:
 		ret = ovpn_swap_keys(ifp, nvl);
 		break;
 	case OVPN_DEL_KEY:
 		ret = ovpn_del_key(ifp, nvl);
 		break;
 	case OVPN_SET_PEER:
 		ret = ovpn_set_peer(ifp, nvl);
 		break;
 	case OVPN_SET_IFMODE:
 		ret = ovpn_set_ifmode(ifp, nvl);
 		break;
 	default:
 		ret = ENOTSUP;
 	}
 
 	nvlist_destroy(nvl);
 	return (ret);
 }
 
 /*
  * Add a nested nvlist called 'name' to 'parent', holding the current values
  * of the two counters under the keys "in" and "out".
  * Returns 0, or ENOMEM when the nested list cannot be created.
  */
 static int
 ovpn_add_counters(nvlist_t *parent, const char *name, counter_u64_t in,
     counter_u64_t out)
 {
 	nvlist_t *pair;
 
 	pair = nvlist_create(0);
 	if (pair == NULL)
 		return (ENOMEM);
 
 	nvlist_add_number(pair, "in", counter_u64_fetch(in));
 	nvlist_add_number(pair, "out", counter_u64_fetch(out));
 
 	/* Our own list is destroyed right after it is added to the parent. */
 	nvlist_add_nvlist(parent, name, pair);
 	nvlist_destroy(pair);
 
 	return (0);
 }
 
 /*
  * Build an nvlist with the interface-wide statistics.  Every entry is a
  * nested nvlist with an "in" and an "out" value.
  *
  * On success the list is stored in *onvl (the caller destroys it) and 0 is
  * returned; on failure nothing is returned through onvl and the error is
  * ENOMEM.
  */
 static int
 ovpn_get_stats(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	nvlist_t *nvl;
 	int ret;
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		return (ENOMEM);
 
 #define OVPN_COUNTER_OUT(name, in, out) \
 	do { \
 		ret = ovpn_add_counters(nvl, name, OVPN_COUNTER(sc, in), \
 		    OVPN_COUNTER(sc, out)); \
 		if (ret != 0) \
 			goto error; \
 	} while(0)
 
 	OVPN_COUNTER_OUT("lost_ctrl", lost_ctrl_pkts_in, lost_ctrl_pkts_out);
 	OVPN_COUNTER_OUT("lost_data", lost_data_pkts_in, lost_data_pkts_out);
 	OVPN_COUNTER_OUT("nomem_data", nomem_data_pkts_in,
 	    nomem_data_pkts_out);
 	OVPN_COUNTER_OUT("data", received_data_pkts, sent_data_pkts);
 	OVPN_COUNTER_OUT("ctrl", received_ctrl_pkts, sent_ctrl_pkts);
 	/*
 	 * "out" has to report the sent counters; reporting the received
 	 * counter twice would make outbound traffic mirror inbound traffic.
 	 */
 	OVPN_COUNTER_OUT("tunnel", tunnel_bytes_received,
 	    tunnel_bytes_sent);
 	OVPN_COUNTER_OUT("transport", transport_bytes_received,
 	    transport_bytes_sent);
 #undef OVPN_COUNTER_OUT
 
 	*onvl = nvl;
 
 	return (0);
 
 error:
 	nvlist_destroy(nvl);
 	return (ret);
 }
 
 /*
  * Build an nvlist with per-peer statistics: a "peers" array with one entry
  * per peer, each holding "peerid" plus nested "packets" and "bytes" lists.
  *
  * On success the list is stored in *nvl (the caller destroys it) and 0 is
  * returned.  On failure *nvl is set to NULL and ENOMEM is returned.  The
  * softc read lock is held while walking the peers and is released on every
  * return path.
  */
 static int
 ovpn_get_peer_stats(struct ovpn_softc *sc, nvlist_t **nvl)
 {
 	struct ovpn_kpeer *peer;
 	nvlist_t *nvpeer = NULL;
 	int ret;
 
 	OVPN_RLOCK_TRACKER;
 
 	*nvl = nvlist_create(0);
 	if (*nvl == NULL)
 		return (ENOMEM);
 
 #define OVPN_PEER_COUNTER_OUT(name, in, out) \
 	do { \
 		ret = ovpn_add_counters(nvpeer, name, \
 		    OVPN_PEER_COUNTER(peer, in), OVPN_PEER_COUNTER(peer, out)); \
 		if (ret != 0) \
 			goto error; \
 	} while(0)
 
 	OVPN_RLOCK(sc);
 	RB_FOREACH(peer, ovpn_kpeers, &sc->peers) {
 		nvpeer = nvlist_create(0);
 		if (nvpeer == NULL) {
 			ret = ENOMEM;
 			goto error;
 		}
 
 		nvlist_add_number(nvpeer, "peerid", peer->peerid);
 
 		OVPN_PEER_COUNTER_OUT("packets", pkt_in, pkt_out);
 		OVPN_PEER_COUNTER_OUT("bytes", bytes_in, bytes_out);
 
 		nvlist_append_nvlist_array(*nvl, "peers", nvpeer);
 		nvlist_destroy(nvpeer);
 	}
 #undef OVPN_PEER_COUNTER_OUT
 	OVPN_RUNLOCK(sc);
 
 	return (0);
 
 error:
 	/*
 	 * Every jump here happens inside the loop, i.e. with the read lock
 	 * held, so it must be dropped before returning.
 	 */
 	OVPN_RUNLOCK(sc);
 	nvlist_destroy(nvpeer);
 	nvlist_destroy(*nvl);
 	*nvl = NULL;
 	return (ret);
 }
 
 /*
  * Report how many notifications are waiting in the ring, as a "pending"
  * number in a fresh nvlist stored in *onvl.
  * Returns 0, or ENOMEM when the list cannot be created.
  */
 static int
 ovpn_poll_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	nvlist_t *reply = nvlist_create(0);
 
 	if (reply == NULL)
 		return (ENOMEM);
 
 	nvlist_add_number(reply, "pending", buf_ring_count(sc->notifring));
 	*onvl = reply;
 
 	return (0);
 }
 
 /*
  * Attach the counters saved in a notification to 'parent' as two nested
  * nvlists, "packets" and "bytes", each with "in" and "out" values.
  * Best effort: stops silently at the first allocation failure, leaving
  * whatever was already added in place.
  */
 static void
 ovpn_notif_add_counters(nvlist_t *parent, struct ovpn_notification *n)
 {
 	const char *const names[2] = { "packets", "bytes" };
 	const uint64_t in[2] = { n->counters.pkt_in, n->counters.bytes_in };
 	const uint64_t out[2] = { n->counters.pkt_out, n->counters.bytes_out };
 	nvlist_t *pair;
 
 	for (int i = 0; i < 2; i++) {
 		pair = nvlist_create(0);
 		if (pair == NULL)
 			return;
 
 		nvlist_add_number(pair, "in", in[i]);
 		nvlist_add_number(pair, "out", out[i]);
 
 		nvlist_add_nvlist(parent, names[i], pair);
 		nvlist_destroy(pair);
 	}
 }
 
 /*
  * Take the next pending notification off the ring and turn it into an
  * nvlist ("peerid", "notification" and, for peer deletions, "del_reason"
  * plus the saved counters), stored in *onvl.
  *
  * Returns 0, ENOENT when nothing is pending, or ENOMEM (in which case the
  * dequeued notification is dropped).
  */
 static int
 opvn_get_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	struct ovpn_notification *notif;
 	nvlist_t *reply;
 
 	/* Anything waiting for us? */
 	notif = buf_ring_dequeue_mc(sc->notifring);
 	if (notif == NULL)
 		return (ENOENT);
 
 	reply = nvlist_create(0);
 	if (reply == NULL) {
 		free(notif, M_OVPN);
 		return (ENOMEM);
 	}
 
 	nvlist_add_number(reply, "peerid", notif->peerid);
 	nvlist_add_number(reply, "notification", notif->type);
 
 	if (notif->type == OVPN_NOTIF_DEL_PEER) {
 		nvlist_add_number(reply, "del_reason", notif->del_reason);
 
 		/*
 		 * Deliberately unchecked: the notification is delivered even
 		 * when the counters cannot be attached.
 		 */
 		ovpn_notif_add_counters(reply, notif);
 	}
 
 	free(notif, M_OVPN);
 	*onvl = reply;
 
 	return (0);
 }
 
 static int
 ovpn_ioctl_get(struct ifnet *ifp, struct ifdrv *ifd)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	nvlist_t *nvl = NULL;
 	int error;
 
 	switch (ifd->ifd_cmd) {
 	case OVPN_GET_STATS:
 		error = ovpn_get_stats(sc, &nvl);
 		break;
 	case OVPN_GET_PEER_STATS:
 		error = ovpn_get_peer_stats(sc, &nvl);
 		break;
 	case OVPN_POLL_PKT:
 		error = ovpn_poll_pkt(sc, &nvl);
 		break;
 	case OVPN_GET_PKT:
 		error = opvn_get_pkt(sc, &nvl);
 		break;
 	default:
 		error = ENOTSUP;
 		break;
 	}
 
 	if (error == 0) {
 		void *packed = NULL;
 		size_t len;
 
 		MPASS(nvl != NULL);
 
 		packed = nvlist_pack(nvl, &len);
 		if (! packed) {
 			nvlist_destroy(nvl);
 			return (ENOMEM);
 		}
 
 		if (len > ifd->ifd_len) {
 			free(packed, M_NVLIST);
 			nvlist_destroy(nvl);
 			return (ENOSPC);
 		}
 
 		error = copyout(packed, ifd->ifd_data, len);
 		ifd->ifd_len = len;
 
 		free(packed, M_NVLIST);
 		nvlist_destroy(nvl);
 	}
 
 	return (error);
 }
 
 /*
  * Interface ioctl entry point.
  *
  * The driver-specific get/set requests require PRIV_NET_OVPN and are
  * forwarded to ovpn_ioctl_get()/ovpn_ioctl_set().  SIOCSIFMTU is range
  * checked against OVPN_MTU_MIN/OVPN_MTU_MAX.  A handful of generic requests
  * are accepted without doing anything; everything else yields EINVAL.
  */
 static int
 ovpn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifdrv *ifd;
 	struct ifreq *ifr;
 	int error;
 
 	CURVNET_ASSERT_SET();
 
 	switch (cmd) {
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		error = priv_check(curthread, PRIV_NET_OVPN);
 		if (error)
 			return (error);
 
 		ifd = (struct ifdrv *)data;
 		if (cmd == SIOCSDRVSPEC)
 			error = ovpn_ioctl_set(ifp, ifd);
 		else
 			error = ovpn_ioctl_get(ifp, ifd);
 		return (error);
 
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 		if (ifr->ifr_mtu < OVPN_MTU_MIN || ifr->ifr_mtu > OVPN_MTU_MAX)
 			return (EINVAL);
 
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 
 	case SIOCSIFADDR:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCGIFMTU:
 	case SIOCSIFFLAGS:
 		/* Accepted, nothing to do. */
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Completion callback for the encryption requests set up in
  * ovpn_transmit_to_peer().
  *
  * crp->crp_opaque is the peer the packet is for; the reference taken on it
  * when the request was dispatched is released here.  crp->crp_buf holds the
  * mbuf that was encrypted.  On a crypto error the packet is counted as lost
  * and freed, otherwise it is handed to ovpn_encap().  Always returns 0.
  */
 static int
 ovpn_encrypt_tx_cb(struct cryptop *crp)
 {
 	struct epoch_tracker et;
 	struct ovpn_kpeer *peer = crp->crp_opaque;
 	/* Fetched up front: 'peer' may not be used after its release below. */
 	struct ovpn_softc *sc = peer->sc;
 	struct mbuf *m = crp->crp_buf.cb_mbuf;
 	int tunnel_len;
 	int ret;
 
 	CURVNET_SET(sc->ifp->if_vnet);
 	NET_EPOCH_ENTER(et);
 
 	if (crp->crp_etype != 0) {
 		/* The crypto operation failed: drop the packet. */
 		crypto_freereq(crp);
 		ovpn_peer_release_ref(peer, false);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		m_freem(m);
 		return (0);
 	}
 
 	MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF);
 
 	/* Payload size without our header, sampled before ovpn_encap() gets m. */
 	tunnel_len = m->m_pkthdr.len - sizeof(struct ovpn_wire_header);
 	ret = ovpn_encap(sc, peer->peerid, m);
 	if (ret == 0) {
 		OVPN_COUNTER_ADD(sc, sent_data_pkts, 1);
 		OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, tunnel_len);
 	}
 
 	crypto_freereq(crp);
 	ovpn_peer_release_ref(peer, false);
 
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 
 	return (0);
 }
 
 /*
  * Final stage of receiving a data packet whose outer headers have already
  * been removed: replay check, activity bookkeeping, statistics and
  * injection into the network stack.
  *
  * Must be called with the softc read lock held and inside the network
  * epoch.  The read lock is released in here on every path, using the
  * caller's tracker passed as _ovpn_lock_trackerp.  The mbuf is always
  * either freed or passed on.
  */
 static void
 ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
     struct ovpn_kpeer *peer, struct ovpn_kkey *key, uint32_t seq,
     struct rm_priotracker *_ovpn_lock_trackerp)
 {
 	uint32_t af;
 
 	OVPN_RASSERT(sc);
 	NET_EPOCH_ASSERT();
 
 	/* Replay protection. */
 	if (V_replay_protection && ! ovpn_check_replay(key->decrypt, seq)) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		return;
 	}
 
 	/* Record this CPU's last-activity time for ovpn_timeout(). */
 	critical_enter();
 	*zpcpu_get(peer->last_active) = time_uptime;
 	critical_exit();
 
 	OVPN_RUNLOCK(sc);
 
 	OVPN_COUNTER_ADD(sc, received_data_pkts, 1);
 	OVPN_COUNTER_ADD(sc, tunnel_bytes_received, m->m_pkthdr.len);
 	/* NOTE(review): 'peer' is used here after the lock was dropped. */
 	OVPN_PEER_COUNTER_ADD(peer, pkt_in, 1);
 	OVPN_PEER_COUNTER_ADD(peer, bytes_in, m->m_pkthdr.len);
 
 	/* Receive the packet on our interface. */
 	m->m_pkthdr.rcvif = sc->ifp;
 
 	/* Clear checksum flags in case the real hardware set them. */
 	m->m_pkthdr.csum_flags = 0;
 
 	/* Ensure we can read the first byte. */
 	m = m_pullup(m, 1);
 	if (m == NULL) {
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return;
 	}
 
 	/*
 	 * Check for address family, and disregard any control packets (e.g.
 	 * keepalive).
 	 */
 	af = ovpn_get_af(m);
 	if (af != 0) {
 		BPF_MTAP2(sc->ifp, &af, sizeof(af), m);
 		/* Queue or dispatch directly, depending on V_async_netisr_queue. */
 		if (V_async_netisr_queue)
 			netisr_queue(af == AF_INET ? NETISR_IP : NETISR_IPV6, m);
 		else
 			netisr_dispatch(af == AF_INET ? NETISR_IP : NETISR_IPV6, m);
 	} else {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 	}
 }
 
 /*
  * Find the key slot of 'peer' whose key id matches the one encoded in the
  * wire header's opcode word.  The first slot is tried before the second.
  * Returns NULL when neither slot matches.  The read lock must be held.
  */
 static struct ovpn_kkey *
 ovpn_find_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer,
     const struct ovpn_wire_header *ohdr)
 {
 	uint8_t wanted;
 
 	OVPN_RASSERT(sc);
 
 	/* The key id is the low three bits of the opcode byte. */
 	wanted = (ntohl(ohdr->opcode) >> 24) & 0x07;
 
 	for (int i = 0; i < 2; i++) {
 		if (peer->keys[i].keyid == wanted)
 			return (&peer->keys[i]);
 	}
 
 	return (NULL);
 }
 
 /*
  * Completion callback for decryption requests on received packets.
  *
  * crp->crp_opaque is the softc and crp->crp_buf the mbuf, which still
  * carries the UDP and OpenVPN wire headers.  The peer and key are looked up
  * again from the wire header, since either may have gone away while the
  * request was in flight.  On every path the crypto request is freed and
  * the softc refcount is decremented.  Always returns 0.
  */
 static int
 ovpn_decrypt_rx_cb(struct cryptop *crp)
 {
 	struct epoch_tracker et;
 	struct ovpn_softc *sc = crp->crp_opaque;
 	struct mbuf *m = crp->crp_buf.cb_mbuf;
 	struct ovpn_kkey *key;
 	struct ovpn_kpeer *peer;
 	struct ovpn_wire_header *ohdr;
 	uint32_t peerid;
 
 	OVPN_RLOCK_TRACKER;
 
 	OVPN_RLOCK(sc);
 
 	MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF);
 
 	if (crp->crp_etype != 0) {
 		/* The crypto operation failed: drop the packet. */
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (0);
 	}
 
 	CURVNET_SET(sc->ifp->if_vnet);
 
 	/* The wire header sits right behind the UDP header. */
 	ohdr = mtodo(m, sizeof(struct udphdr));
 
 	/* The low 24 bits of the opcode word carry the peer id. */
 	peerid = ntohl(ohdr->opcode) & 0x00ffffff;
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL) {
 		/* No such peer. Drop packet. */
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	key = ovpn_find_key(sc, peer, ohdr);
 	if (key == NULL) {
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		/*
 		 * Has this key been removed between us starting the decrypt
 		 * and finishing it?
 		 */
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	/* Now remove the outer headers */
 	m_adj_decap(m, sizeof(struct udphdr) +
 	    sizeof(struct ovpn_wire_header));
 
 	/*
 	 * ovpn_finish_rx() drops the read lock for us, which the assertion
 	 * after it verifies.
 	 * NOTE(review): ohdr->seq is read after the headers were trimmed;
 	 * presumably the trimmed bytes are still intact - confirm.
 	 */
 	NET_EPOCH_ENTER(et);
 	ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq), _ovpn_lock_trackerp);
 	NET_EPOCH_EXIT(et);
 	OVPN_UNLOCK_ASSERT(sc);
 
 	CURVNET_RESTORE();
 
 	crypto_freereq(crp);
 	atomic_add_int(&sc->refcount, -1);
 
 	return (0);
 }
 
 /*
  * Guess the address family of a packet from the IP version found in its
  * first byte.  Returns AF_INET, AF_INET6, or 0 when it is neither.
  */
 static int
 ovpn_get_af(struct mbuf *m)
 {
 	struct ip *ip4;
 	struct ip6_hdr *ip6;
 
 	/*
 	 * No pullup needed: only the very first byte is inspected, and that
 	 * is always contiguous.
 	 */
 	ip4 = mtod(m, struct ip *);
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if (ip4->ip_v == IPVERSION)
 		return (AF_INET);
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION)
 		return (AF_INET6);
 
 	return (0);
 }
 
 /*
  * Find the peer whose tunnel IPv4 address equals 'addr' by walking all
  * peers.  Returns NULL when there is none.  The softc lock must be held.
  */
 static struct ovpn_kpeer *
 ovpn_find_peer_by_ip(struct ovpn_softc *sc, const struct in_addr addr)
 {
 	struct ovpn_kpeer *candidate;
 
 	OVPN_ASSERT(sc);
 
 	/* TODO: Add a second RB so we can look up by IP. */
 	RB_FOREACH(candidate, ovpn_kpeers, &sc->peers) {
 		if (candidate->vpn4.s_addr == addr.s_addr)
 			return (candidate);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find the peer whose tunnel IPv6 address equals '*addr' by walking all
  * peers.  Returns NULL when there is none.  The softc lock must be held.
  */
 static struct ovpn_kpeer *
 ovpn_find_peer_by_ip6(struct ovpn_softc *sc, const struct in6_addr *addr)
 {
 	struct ovpn_kpeer *candidate;
 
 	OVPN_ASSERT(sc);
 
 	/* TODO: Add a third RB so we can look up by IPv6 address. */
 	RB_FOREACH(candidate, ovpn_kpeers, &sc->peers) {
 		if (memcmp(addr, &candidate->vpn6, sizeof(*addr)) == 0)
 			return (candidate);
 	}
 
 	return (NULL);
 }
 
 /*
  * Pick the peer an outgoing packet should be sent to.
  *
  * With exactly one peer that peer is used unconditionally.  Otherwise the
  * destination address (taken from 'dst' when given, else from the packet's
  * IP header) is matched against the peers' tunnel addresses; if that finds
  * nothing, a route lookup is done and a gateway next hop is matched instead.
  *
  * *m0 may be replaced by m_pullup(); if the pullup fails *m0 is NULL and
  * NULL is returned.  Returns NULL as well when no peer matches or the
  * address family is not handled.  Must be called inside the network epoch
  * with the softc lock held.
  */
 static struct ovpn_kpeer *
 ovpn_route_peer(struct ovpn_softc *sc, struct mbuf **m0,
     const struct sockaddr *dst)
 {
 	struct ovpn_kpeer *peer = NULL;
 	int af;
 
 	NET_EPOCH_ASSERT();
 	OVPN_ASSERT(sc);
 
 	/* Shortcut if we're a client (or are a server and have only one client). */
 	if (sc->peercount == 1)
 		return (ovpn_find_only_peer(sc));
 
 	/* Without an explicit destination, go by the packet's IP version. */
 	if (dst != NULL)
 		af = dst->sa_family;
 	else
 		af = ovpn_get_af(*m0);
 
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		const struct sockaddr_in *sa = (const struct sockaddr_in *)dst;
 		struct nhop_object *nh;
 		const struct in_addr *ip_dst;
 
 		if (sa != NULL) {
 			ip_dst = &sa->sin_addr;
 		} else {
 			struct ip *ip;
 
 			/* Make the whole IP header contiguous before reading it. */
 			*m0 = m_pullup(*m0, sizeof(struct ip));
 			if (*m0 == NULL)
 				return (NULL);
 			ip = mtod(*m0, struct ip *);
 			ip_dst = &ip->ip_dst;
 		}
 
 		peer = ovpn_find_peer_by_ip(sc, *ip_dst);
 		SDT_PROBE2(if_ovpn, tx, route, ip4, ip_dst, peer);
 		if (peer == NULL) {
 			/* Not a peer address itself: try the route's gateway. */
 			nh = fib4_lookup(M_GETFIB(*m0), *ip_dst, 0,
 			    NHR_NONE, 0);
 			if (nh && (nh->nh_flags & NHF_GATEWAY)) {
 				peer = ovpn_find_peer_by_ip(sc,
 				    nh->gw4_sa.sin_addr);
 				SDT_PROBE2(if_ovpn, tx, route, ip4,
 				    &nh->gw4_sa.sin_addr, peer);
 			}
 		}
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		const struct sockaddr_in6 *sa6 =
 		    (const struct sockaddr_in6 *)dst;
 		struct nhop_object *nh;
 		const struct in6_addr *ip6_dst;
 
 		if (sa6 != NULL) {
 			ip6_dst = &sa6->sin6_addr;
 		} else {
 			struct ip6_hdr *ip6;
 
 			/* Make the whole IPv6 header contiguous before reading it. */
 			*m0 = m_pullup(*m0, sizeof(struct ip6_hdr));
 			if (*m0 == NULL)
 				return (NULL);
 			ip6 = mtod(*m0, struct ip6_hdr *);
 			ip6_dst = &ip6->ip6_dst;
 		}
 
 		peer = ovpn_find_peer_by_ip6(sc, ip6_dst);
 		SDT_PROBE2(if_ovpn, tx, route, ip6, ip6_dst, peer);
 		if (peer == NULL) {
 			/* Not a peer address itself: try the route's gateway. */
 			nh = fib6_lookup(M_GETFIB(*m0), ip6_dst, 0,
 			    NHR_NONE, 0);
 			if (nh && (nh->nh_flags & NHF_GATEWAY)) {
 				peer = ovpn_find_peer_by_ip6(sc,
 				    &nh->gw6_sa.sin6_addr);
 				SDT_PROBE2(if_ovpn, tx, route, ip6,
 				    &nh->gw6_sa.sin6_addr, peer);
 			}
 		}
 		break;
 	}
 #endif
 	}
 
 	return (peer);
 }
 
 /*
  * if_transmit style entry point: forwards the packet to the interface's
  * output routine with no destination address and no route.
  */
 static int
 ovpn_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	return ((*ifp->if_output)(ifp, m, NULL, NULL));
 }
 
 /*
  * Send one tunnel packet to a specific peer: prepend the OpenVPN wire
  * header, then either encapsulate directly (cipher "none") or hand the
  * packet to the crypto framework, whose completion callback
  * (ovpn_encrypt_tx_cb) does the encapsulation.
  *
  * The softc read lock must be held on entry.  When _ovpn_lock_trackerp is
  * not NULL the read lock is released in here on every path; when it is NULL
  * the lock is left alone.
  *
  * Returns 0 on success, ENOLINK when the peer has no primary encryption
  * key, ENOBUFS on allocation failure, or the error from ovpn_encap() or
  * the crypto dispatch.
  */
 static int
 ovpn_transmit_to_peer(struct ifnet *ifp, struct mbuf *m,
     struct ovpn_kpeer *peer, struct rm_priotracker *_ovpn_lock_trackerp)
 {
 	struct ovpn_wire_header *ohdr;
 	struct ovpn_kkey *key;
 	struct ovpn_softc *sc;
 	struct cryptop *crp;
 	uint32_t af, seq;
 	size_t len, ovpn_hdr_len;
 	int tunnel_len;
 	int ret;
 
 	sc = ifp->if_softc;
 
 	OVPN_RASSERT(sc);
 
 	/* Size of the inner packet, before any headers are prepended. */
 	tunnel_len = m->m_pkthdr.len;
 
 	/* Transmission always uses the primary key slot. */
 	key = &peer->keys[OVPN_KEY_SLOT_PRIMARY];
 	if (key->encrypt == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (ENOLINK);
 	}
 
 	af = ovpn_get_af(m);
 	/* Don't capture control packets. */
 	if (af != 0)
 		BPF_MTAP2(ifp, &af, sizeof(af), m);
 
 	len = m->m_pkthdr.len;
 	MPASS(len <= ifp->if_mtu);
 
 	ovpn_hdr_len = sizeof(struct ovpn_wire_header);
 	if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE)
 		ovpn_hdr_len -= 16; /* No auth tag. */
 
 	M_PREPEND(m, ovpn_hdr_len, M_NOWAIT);
 	if (m == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		return (ENOBUFS);
 	}
 	ohdr = mtod(m, struct ovpn_wire_header *);
 	/* Opcode and key id form the top byte, the peer id the low 24 bits. */
 	ohdr->opcode = (OVPN_OP_DATA_V2 << OVPN_OP_SHIFT) | key->keyid;
 	ohdr->opcode <<= 24;
 	ohdr->opcode |= key->peerid;
 	ohdr->opcode = htonl(ohdr->opcode);
 
 	/* Each packet gets the next sequence number, in network byte order. */
 	seq = atomic_fetchadd_32(&peer->tx_seq, 1);
 	seq = htonl(seq);
 	ohdr->seq = seq;
 
 	OVPN_PEER_COUNTER_ADD(peer, pkt_out, 1);
 	OVPN_PEER_COUNTER_ADD(peer, bytes_out, len);
 
 	if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE) {
 		/* Nothing to encrypt: encapsulate and send right away. */
 		ret = ovpn_encap(sc, peer->peerid, m);
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		if (ret == 0) {
 			OVPN_COUNTER_ADD(sc, sent_data_pkts, 1);
 			OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, tunnel_len);
 		}
 		return (ret);
 	}
 
 	crp = crypto_getreq(key->encrypt->cryptoid, M_NOWAIT);
 	if (crp == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* Encryption covers only the payload, not the header. */
 	crp->crp_payload_start = sizeof(*ohdr);
 	crp->crp_payload_length = len;
 	crp->crp_op = CRYPTO_OP_ENCRYPT;
 
 	/*
 	 * AAD data covers the ovpn_wire_header minus the auth
 	 * tag.
 	 */
 	crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 	crp->crp_aad = ohdr;
 	crp->crp_aad_start = 0;
 	crp->crp_op |= CRYPTO_OP_COMPUTE_DIGEST;
 	crp->crp_digest_start = offsetof(struct ovpn_wire_header, auth_tag);
 
 	/* The IV is the sequence number followed by the key's nonce. */
 	crp->crp_flags |= CRYPTO_F_IV_SEPARATE;
 	memcpy(crp->crp_iv, &seq, sizeof(seq));
 	memcpy(crp->crp_iv + sizeof(seq), key->encrypt->nonce,
 	    key->encrypt->noncelen);
 
 	crypto_use_mbuf(crp, m);
 	crp->crp_flags |= CRYPTO_F_CBIFSYNC;
 	crp->crp_callback = ovpn_encrypt_tx_cb;
 	crp->crp_opaque = peer;
 
 	/* This reference is dropped again in ovpn_encrypt_tx_cb(). */
 	atomic_add_int(&peer->refcount, 1);
 	if (_ovpn_lock_trackerp != NULL)
 		OVPN_RUNLOCK(sc);
 	if (V_async_crypto)
 		ret = crypto_dispatch_async(crp, CRYPTO_ASYNC_ORDERED);
 	else
 		ret = crypto_dispatch(crp);
 	if (ret) {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 	}
 
 	return (ret);
 }
 
 /*
  * Wrap an already framed OpenVPN packet in UDP and IPv4/IPv6 headers for the
  * peer identified by 'peerid' and pass it to ip_output()/ip6_output().
  *
  * Locking: this function takes the softc read lock itself and drops it on
  * every return path, before the packet is handed to the IP output routine.
  * The transmit path visible in this file may still hold its own read lock
  * across the call and releases that one separately afterwards.
  *
  * Asserts that it runs inside the network epoch.
  *
  * Returns ENETDOWN when the peer cannot be found or the link is not up (the
  * mbuf is freed here), ENOBUFS when a header cannot be prepended, and
  * otherwise whatever ip_output()/ip6_output() returns.
  */
 static int
 ovpn_encap(struct ovpn_softc *sc, uint32_t peerid, struct mbuf *m)
 {
 	struct udphdr *udp;
 	struct ovpn_kpeer *peer;
 	int len;
 
 	OVPN_RLOCK_TRACKER;
 
 	OVPN_RLOCK(sc);
 	NET_EPOCH_ASSERT();
 
 	/* Look the peer up again by id rather than trusting a cached pointer. */
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL || sc->ifp->if_link_state != LINK_STATE_UP) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Payload length before any outer header is added. */
 	len = m->m_pkthdr.len;
 
 	M_PREPEND(m, sizeof(struct udphdr), M_NOWAIT);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		/* m is NULL on this path, so there is no mbuf left to free. */
 		m_freem(m);
 		return (ENOBUFS);
 	}
 	udp = mtod(m, struct udphdr *);
 
 	MPASS(peer->local.ss_family == peer->remote.ss_family);
 
 	udp->uh_sport = ovpn_get_port(&peer->local);
 	udp->uh_dport = ovpn_get_port(&peer->remote);
 	udp->uh_ulen = htons(sizeof(struct udphdr) + len);
 
 	/* The outer IP header depends on the address family of the peer. */
 	switch (peer->remote.ss_family) {
 #ifdef INET
 	case AF_INET: {
 		struct sockaddr_in *in_local = TO_IN(&peer->local);
 		struct sockaddr_in *in_remote = TO_IN(&peer->remote);
 		struct ip *ip;
 
 		/*
 		 * This requires knowing the source IP, which we don't. Happily
 		 * we're allowed to keep this at 0, and the checksum won't do
 		 * anything the crypto won't already do.
 		 */
 		udp->uh_sum = 0;
 
 		/* Set the checksum flags so we recalculate checksums. */
 		m->m_pkthdr.csum_flags |= CSUM_IP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 
 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 		ip = mtod(m, struct ip *);
 
 		ip->ip_tos = 0;
 		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct udphdr) +
 		   len);
 		ip->ip_off = 0;
 		ip->ip_ttl = V_ip_defttl;
 		ip->ip_p = IPPROTO_UDP;
 		ip->ip_sum = 0;
 		/*
 		 * Only use the stored local address when a local port is set;
 		 * otherwise leave the source address as INADDR_ANY.
 		 */
 		if (in_local->sin_port != 0)
 			ip->ip_src = in_local->sin_addr;
 		else
 			ip->ip_src.s_addr = INADDR_ANY;
 		ip->ip_dst = in_remote->sin_addr;
 
 		/* Drop our read lock before entering the IP output path. */
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len);
 
 		return (ip_output(m, NULL, NULL, 0, NULL, NULL));
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct sockaddr_in6 *in6_local = TO_IN6(&peer->local);
 		struct sockaddr_in6 *in6_remote = TO_IN6(&peer->remote);
 		struct ip6_hdr *ip6;
 
 		M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT);
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 		/*
 		 * Both the IPv6 and the UDP header are accessed through
 		 * mtod()/mtodo() below, so make them contiguous first.
 		 */
 		m = m_pullup(m, sizeof(*ip6) + sizeof(*udp));
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 
 		ip6 = mtod(m, struct ip6_hdr *);
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
 		ip6->ip6_plen = htons(sizeof(*ip6) + sizeof(struct udphdr) +
 		    len);
 		ip6->ip6_nxt = IPPROTO_UDP;
 		ip6->ip6_hlim = V_ip6_defhlim;
 
 		memcpy(&ip6->ip6_src, &in6_local->sin6_addr,
 		    sizeof(ip6->ip6_src));
 		memcpy(&ip6->ip6_dst, &in6_remote->sin6_addr,
 		    sizeof(ip6->ip6_dst));
 
 		/* Re-derive the UDP pointer; the chain may have been moved. */
 		udp = mtodo(m, sizeof(*ip6));
 		/*
 		 * Seed uh_sum with the pseudo-header checksum and flag the
 		 * packet CSUM_UDP_IPV6; presumably the remainder is completed
 		 * further down the output path - confirm.
 		 */
 		udp->uh_sum = in6_cksum_pseudo(ip6,
 		    m->m_pkthdr.len - sizeof(struct ip6_hdr),
 		    IPPROTO_UDP, 0);
 
 		m->m_pkthdr.csum_flags |= CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 
 		/* Drop our read lock before entering the IPv6 output path. */
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len);
 
 		return (ip6_output(m, NULL, NULL, IPV6_UNSPECSRC, NULL, NULL,
 		    NULL));
 	}
 #endif
 	default:
 		/* Peers with any other address family are not expected here. */
 		panic("Unsupported address family %d",
 		    peer->remote.ss_family);
 	}
 }
 
 /*
  * Output handler installed as if_output: select the peer that should
  * receive 'm' and pass the packet on to ovpn_transmit_to_peer().
  *
  * The read lock taken here is passed on to ovpn_transmit_to_peer() via the
  * lock tracker on the success path; on the drop path it is released here.
  * Returns ENETDOWN, after freeing the mbuf, when the link is not up or no
  * peer matches.
  */
 static int
 ovpn_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	struct ovpn_softc *sc;
 	struct ovpn_kpeer *peer;
 	const struct sockaddr *gw;
 
 	OVPN_RLOCK_TRACKER;
 
 	sc = ifp->if_softc;
 
 	OVPN_RLOCK(sc);
 
 	SDT_PROBE1(if_ovpn, tx, transmit, start, m);
 
 	if (__predict_false(ifp->if_link_state != LINK_STATE_UP))
 		goto drop;
 
 	/*
 	 * 'dst' (the gateway) is only honoured when no route is supplied.
 	 * A NULL 'ro' is how we recognise a call coming from pf's route-to,
 	 * in which case 'dst' really is the destination to route on.
 	 * We cannot use 'dst' unconditionally: the usual openvpn setup
 	 * configures the first non-server IP of the subnet as the gateway,
 	 * so always following it would send all traffic to the first client.
 	 */
 	gw = (ro != NULL) ? NULL : dst;
 	peer = ovpn_route_peer(sc, &m, gw);
 	if (peer == NULL) {
 		/* No destination. */
 		goto drop;
 	}
 
 	return (ovpn_transmit_to_peer(ifp, m, peer, _ovpn_lock_trackerp));
 
 drop:
 	OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 	OVPN_RUNLOCK(sc);
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Replay protection for received data packets.
  *
  * rx_seq is the highest sequence number below which everything has been
  * seen; rx_window is a bitmap of sequence numbers above rx_seq that were
  * already received out of order (bit 0 corresponds to rx_seq + 1).
  *
  * Returns true if 'seq' is new and should be accepted, false if it is old
  * or a duplicate. All state is updated under replay_mtx.
  */
 static bool
 ovpn_check_replay(struct ovpn_kkey_dir *key, uint32_t seq)
 {
 	uint32_t bit;
 	bool accept;
 
 	mtx_lock(&key->replay_mtx);
 
 	if (seq <= key->rx_seq) {
 		/* Sequence number must be strictly greater than rx_seq */
 		accept = false;
 	} else if (seq > (key->rx_seq + (sizeof(key->rx_window) * 8))) {
 		/*
 		 * Jump beyond the window. The packet authenticated okay, so
 		 * accept it and restart tracking from here.
 		 */
 		key->rx_seq = seq;
 		key->rx_window = 0;
 		accept = true;
 	} else if ((seq == key->rx_seq + 1) && key->rx_window == 0) {
 		/* In-order packet with nothing outstanding. */
 		key->rx_seq++;
 		accept = true;
 	} else {
 		/* Position of this sequence number inside the window. */
 		bit = seq - key->rx_seq - 1;
 
 		if (key->rx_window & ((uint64_t)1 << bit)) {
 			/* Already seen: duplicate. */
 			accept = false;
 		} else {
 			key->rx_window |= (uint64_t)1 << bit;
 
 			/* Slide the window over any now-contiguous prefix. */
 			while (key->rx_window & 1) {
 				key->rx_seq++;
 				key->rx_window >>= 1;
 			}
 			accept = true;
 		}
 	}
 
 	mtx_unlock(&key->replay_mtx);
 
 	return (accept);
 }
 
 static struct ovpn_kpeer *
 ovpn_peer_from_mbuf(struct ovpn_softc *sc, struct mbuf *m, int off)
 {
 	struct ovpn_wire_header ohdr;
 	uint32_t peerid;
 	const size_t hdrlen = sizeof(ohdr) - sizeof(ohdr.auth_tag);
 
 	OVPN_RASSERT(sc);
 
 	if (m_length(m, NULL) < (off + sizeof(struct udphdr) + hdrlen))
 		return (NULL);
 
 	m_copydata(m, off + sizeof(struct udphdr), hdrlen, (caddr_t)&ohdr);
 
 	peerid = ntohl(ohdr.opcode) & 0x00ffffff;
 
 	return (ovpn_find_peer(sc, peerid));
 }
 
 /*
  * Input handler for UDP datagrams that may carry OpenVPN data packets.
  * (The signature suggests a UDP tunnelling callback - confirm at the
  * registration site.)
  *
  * 'off' is the offset of the UDP header within 'm' and 'ctx' is the
  * ovpn_softc.
  *
  * Returns false when the packet is not handled here (unknown peer, packet
  * too short, or opcode other than DATA_V2); nothing in the mbuf has been
  * modified at that point. Returns true once the packet has been taken over,
  * whether it was passed on for delivery/decryption or dropped on error.
  */
 static bool
 ovpn_udp_input(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct ovpn_softc *sc = ctx;
 	struct ovpn_wire_header tmphdr;
 	struct ovpn_wire_header *ohdr;
 	struct udphdr *uhdr;
 	struct ovpn_kkey *key;
 	struct cryptop *crp;
 	struct ovpn_kpeer *peer;
 	size_t ohdrlen;
 	int ret;
 	uint8_t op;
 
 	OVPN_RLOCK_TRACKER;
 
 	M_ASSERTPKTHDR(m);
 
 	OVPN_COUNTER_ADD(sc, transport_bytes_received, m->m_pkthdr.len - off);
 
 	/* Start with the wire header minus the auth tag. */
 	ohdrlen = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 
 	OVPN_RLOCK(sc);
 
 	peer = ovpn_peer_from_mbuf(sc, m, off);
 	if (peer == NULL) {
 		OVPN_RUNLOCK(sc);
 		return (false);
 	}
 
 	if (m_length(m, NULL) < (off + sizeof(*uhdr) + ohdrlen)) {
 		/* Short packet. */
 		OVPN_RUNLOCK(sc);
 		return (false);
 	}
 
 	/* Peek at the header via a copy so the mbuf stays untouched for now. */
 	m_copydata(m, off + sizeof(*uhdr), ohdrlen, (caddr_t)&tmphdr);
 
 	op = ntohl(tmphdr.opcode) >> 24 >> OVPN_OP_SHIFT;
 	if (op != OVPN_OP_DATA_V2) {
 		/* Control packet? */
 		OVPN_RUNLOCK(sc);
 		return (false);
 	}
 
 	/* From here on the packet is ours; every return below is 'true'. */
 	m = m_pullup(m, off + sizeof(*uhdr) + ohdrlen);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return (true);
 	}
 
 	/*
 	 * Simplify things by getting rid of the preceding headers, we don't
 	 * care about them.
 	 */
 	m_adj_decap(m, off);
 
 	uhdr = mtodo(m, 0);
 	ohdr = mtodo(m, sizeof(*uhdr));
 
 	key = ovpn_find_key(sc, peer, ohdr);
 	if (key == NULL || key->decrypt == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		return (true);
 	}
 
 	if (key->decrypt->cipher == OVPN_CIPHER_ALG_NONE) {
 		/* Now remove the outer headers */
 		m_adj_decap(m, sizeof(struct udphdr) + ohdrlen);
 
 		/*
 		 * NOTE(review): ohdr is re-derived after the UDP and OpenVPN
 		 * headers were trimmed above, so ohdr->seq is read from the
 		 * remaining data rather than from the original header -
 		 * confirm this is the intended sequence number source.
 		 */
 		ohdr = mtodo(m, sizeof(*uhdr));
 
 		/* The read lock is expected to be dropped by ovpn_finish_rx(). */
 		ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq),
 		    _ovpn_lock_trackerp);
 		OVPN_UNLOCK_ASSERT(sc);
 		return (true);
 	}
 
 	/* Encrypted packets also carry the auth tag in the wire header. */
 	ohdrlen += sizeof(ohdr->auth_tag);
 
 	m = m_pullup(m, sizeof(*uhdr) + ohdrlen);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return (true);
 	}
 	/* m_pullup() may have replaced the first mbuf; refresh the pointers. */
 	uhdr = mtodo(m, 0);
 	ohdr = mtodo(m, sizeof(*uhdr));
 
 	/* Decrypt */
 	crp = crypto_getreq(key->decrypt->cryptoid, M_NOWAIT);
 	if (crp == NULL) {
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (true);
 	}
 
 	/* The payload follows the UDP header and the full wire header. */
 	crp->crp_payload_start = sizeof(struct udphdr) + sizeof(*ohdr);
 	crp->crp_payload_length = ntohs(uhdr->uh_ulen) -
 	    sizeof(*uhdr) - sizeof(*ohdr);
 	crp->crp_op = CRYPTO_OP_DECRYPT;
 
 	/* AAD validation. */
 	crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 	crp->crp_aad = ohdr;
 	crp->crp_aad_start = 0;
 	crp->crp_op |= CRYPTO_OP_VERIFY_DIGEST;
 	crp->crp_digest_start = sizeof(struct udphdr) +
 	    offsetof(struct ovpn_wire_header, auth_tag);
 
 	/* IV is the packet's sequence number followed by the key's nonce. */
 	crp->crp_flags |= CRYPTO_F_IV_SEPARATE;
 	memcpy(crp->crp_iv, &ohdr->seq, sizeof(ohdr->seq));
 	memcpy(crp->crp_iv + sizeof(ohdr->seq), key->decrypt->nonce,
 	    key->decrypt->noncelen);
 
 	crypto_use_mbuf(crp, m);
 	crp->crp_flags |= CRYPTO_F_CBIFSYNC;
 	crp->crp_callback = ovpn_decrypt_rx_cb;
 	crp->crp_opaque = sc;
 
 	/*
 	 * Count the in-flight request on the softc before dropping the lock;
 	 * ovpn_clone_destroy() refuses to run while refcount is non-zero.
 	 */
 	atomic_add_int(&sc->refcount, 1);
 	OVPN_RUNLOCK(sc);
 	if (V_async_crypto)
 		ret = crypto_dispatch_async(crp, CRYPTO_ASYNC_ORDERED);
 	else
 		ret = crypto_dispatch(crp);
 	if (ret != 0) {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 	}
 
 	return (true);
 }
 
 /*
  * if_qflush handler. Intentionally empty: it does nothing with the
  * interface it is given.
  */
 static void
 ovpn_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 /*
  * Discard every notification still queued on the softc's notification
  * ring, freeing each entry. The softc write lock must be held.
  */
 static void
 ovpn_flush_rxring(struct ovpn_softc *sc)
 {
 	struct ovpn_notification *notif;
 
 	OVPN_WASSERT(sc);
 
 	for (;;) {
 		if (buf_ring_empty(sc->notifring))
 			break;
 
 		notif = buf_ring_dequeue_sc(sc->notifring);
 		free(notif, M_OVPN);
 	}
 }
 
 #ifdef VIMAGE
 /*
  * if_reassign handler, called when the interface is moved to another vnet.
  * Removes all peers (and with them their keys and configuration) and drops
  * any queued notifications, all under the softc write lock.
  */
 static void
 ovpn_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct ovpn_kpeer *cur, *next;
 	struct ovpn_softc *sc;
 	int ret __diagused;
 
 	sc = ifp->if_softc;
 
 	OVPN_WLOCK(sc);
 
 	/* Flush keys & configuration. */
 	RB_FOREACH_SAFE(cur, ovpn_kpeers, &sc->peers, next) {
 		cur->del_reason = OVPN_DEL_REASON_REQUESTED;
 		ret = _ovpn_del_peer(sc, cur);
 		MPASS(ret == 0);
 	}
 
 	ovpn_flush_rxring(sc);
 
 	OVPN_WUNLOCK(sc);
 }
 #endif
 
 /*
  * Cloner match callback: returns non-zero when 'name' begins with the
  * driver's name prefix.
  */
 static int
 ovpn_clone_match(struct if_clone *ifc, const char *name)
 {
 	const size_t prefixlen = strlen(ovpnname);
 
 	/*
 	 * Any name starting with 'ovpn' is accepted, specifically because
 	 * pfSense uses names such as ovpnc1 / ovpns2.
 	 */
 	if (strncmp(ovpnname, name, prefixlen) != 0)
 		return (0);
 
 	return (1);
 }
 
 static int
 ovpn_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct ovpn_softc *sc;
 	struct ifnet *ifp;
 	char *dp;
 	int error, unit, wildcard;
 
 	/* Try to see if a special unit was requested. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If no unit had been given, we need to adjust the ifName.
 	 */
 	for (dp = name; *dp != '\0'; dp++);
 	if (wildcard) {
 		error = snprintf(dp, len - (dp - name), "%d", unit);
 		if (error > len - (dp - name)) {
 			/* ifName too long. */
 			ifc_free_unit(ifc, unit);
 			return (ENOSPC);
 		}
 		dp += error;
 	}
 
 	/* Make sure it doesn't already exist. */
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 
 	sc = malloc(sizeof(struct ovpn_softc), M_OVPN, M_WAITOK | M_ZERO);
 	sc->ifp = if_alloc(IFT_ENC);
 	rm_init_flags(&sc->lock, "if_ovpn_lock", RM_RECURSE);
 	sc->refcount = 0;
 
 	sc->notifring = buf_ring_alloc(32, M_OVPN, M_WAITOK, NULL);
 
 	COUNTER_ARRAY_ALLOC(sc->counters, OVPN_COUNTER_SIZE, M_WAITOK);
 
 	ifp = sc->ifp;
 	ifp->if_softc = sc;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = ovpngroupname;
 	ifp->if_dunit = unit;
 
 	ifp->if_addrlen = 0;
 	ifp->if_mtu = 1428;
 	ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
 	ifp->if_ioctl = ovpn_ioctl;
 	ifp->if_transmit = ovpn_transmit;
 	ifp->if_output = ovpn_output;
 	ifp->if_qflush = ovpn_qflush;
 #ifdef VIMAGE
 	ifp->if_reassign = ovpn_reassign;
 #endif
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(uint32_t));
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Deferred half of interface destruction, scheduled through
  * NET_EPOCH_CALL() by ovpn_clone_destroy(). Frees the counters, the ifnet
  * and finally the softc itself; all peers must already be gone.
  */
 static void
 ovpn_clone_destroy_cb(struct epoch_context *ctx)
 {
 	struct ovpn_softc *sc =
 	    __containerof(ctx, struct ovpn_softc, epoch_ctx);
 	struct ifnet *ifp = sc->ifp;
 
 	MPASS(sc->peercount == 0);
 	MPASS(RB_EMPTY(&sc->peers));
 
 	COUNTER_ARRAY_FREE(sc->counters, OVPN_COUNTER_SIZE);
 
 	if_free(ifp);
 	free(sc, M_OVPN);
 }
 
 /*
  * Cloner destroy callback: tear down an ovpn interface.
  *
  * Returns EBUSY without changing anything while the softc refcount is
  * non-zero (it is raised for in-flight receive crypto requests). Otherwise
  * removes all peers, frees the notification ring, detaches the interface
  * and schedules ovpn_clone_destroy_cb() to free the remaining memory.
  * Returns 0 on success.
  */
 static int
 ovpn_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct ovpn_softc *sc;
 	struct ovpn_kpeer *peer, *tmppeer;
 	int unit;
 	int ret __diagused;
 
 	sc = ifp->if_softc;
 	/* Remember the unit now; it is released after the ifnet is detached. */
 	unit = ifp->if_dunit;
 
 	OVPN_WLOCK(sc);
 
 	if (atomic_load_int(&sc->refcount) > 0) {
 		OVPN_WUNLOCK(sc);
 		return (EBUSY);
 	}
 
 	/* Remove every peer; deletion is not expected to fail here. */
 	RB_FOREACH_SAFE(peer, ovpn_kpeers, &sc->peers, tmppeer) {
 		peer->del_reason = OVPN_DEL_REASON_REQUESTED;
 		ret = _ovpn_del_peer(sc, peer);
 		MPASS(ret == 0);
 	}
 
 	/* Drop queued notifications before freeing the ring itself. */
 	ovpn_flush_rxring(sc);
 	buf_ring_free(sc->notifring, M_OVPN);
 
 	OVPN_WUNLOCK(sc);
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	ifp->if_softc = NULL;
 
 	/* The softc and ifnet are freed from the epoch callback. */
 	NET_EPOCH_CALL(ovpn_clone_destroy_cb, &sc->epoch_ctx);
 
 	if (unit != IF_DUNIT_NONE)
 		ifc_free_unit(ifc, unit);
 
 	/* Presumably waits for the callback queued above to run - confirm. */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	return (0);
 }
 
 /*
  * Per-vnet initialisation: register the ovpn interface cloner with its
  * match, create and destroy callbacks and remember it in V_ovpn_cloner.
  */
 static void
 vnet_ovpn_init(const void *unused __unused)
 {
 	struct if_clone_addreq addreq = {
 		.destroy_f = ovpn_clone_destroy,
 		.create_f = ovpn_clone_create,
 		.match_f = ovpn_clone_match,
 	};
 
 	V_ovpn_cloner = ifc_attach_cloner(ovpngroupname, &addreq);
 }
 VNET_SYSINIT(vnet_ovpn_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ovpn_init, NULL);
 
 /*
  * Per-vnet teardown: detach the cloner that vnet_ovpn_init() stored in
  * V_ovpn_cloner.
  */
 static void
 vnet_ovpn_uninit(const void *unused __unused)
 {
 	if_clone_detach(V_ovpn_cloner);
 }
 VNET_SYSUNINIT(vnet_ovpn_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ovpn_uninit, NULL);
 
 /*
  * Module event handler. Load and unload need no work here because the
  * cloner is set up and torn down per vnet by vnet_ovpn_init() and
  * vnet_ovpn_uninit(); every other event type is rejected.
  */
 static int
 ovpnmodevent(module_t mod, int type, void *data)
 {
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, and an unused extra argument. */
 static moduledata_t ovpn_mod = {
 	"if_ovpn",
 	ovpnmodevent,
 	0
 };
 
 /* Register the module and publish its version. */
 DECLARE_MODULE(if_ovpn, ovpn_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_ovpn, 1);
diff --git a/sys/net/if_private.h b/sys/net/if_private.h
new file mode 100644
index 000000000000..70212e79d077
--- /dev/null
+++ b/sys/net/if_private.h
@@ -0,0 +1,198 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
+ */
+
+#ifndef	_NET_IF_PRIVATE_H_
+#define	_NET_IF_PRIVATE_H_
+
+#ifdef	_KERNEL
+/*
+ * Kernel-private definition of struct ifnet, describing one network interface.
+ */
+struct ifnet {
+	/* General bookkeeping of interface lists. */
+	CK_STAILQ_ENTRY(ifnet) if_link; 	/* all struct ifnets are chained (CK_) */
+	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
+	CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */
+					/* protected by if_addr_lock */
+	u_char	if_alloctype;		/* if_type at time of allocation */
+	uint8_t	if_numa_domain;		/* NUMA domain of device */
+	/* Driver and protocol specific information that remains stable. */
+	void	*if_softc;		/* pointer to driver state */
+	void	*if_llsoftc;		/* link layer softc */
+	void	*if_l2com;		/* pointer to protocol bits */
+	const char *if_dname;		/* driver name */
+	int	if_dunit;		/* unit or IF_DUNIT_NONE */
+	u_short	if_index;		/* numeric abbreviation for this if  */
+	u_short	if_idxgen;		/* ... and its generation count */
+	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
+	char	*if_description;	/* interface description */
+
+	/* Variable fields that are touched by the stack and drivers. */
+	int	if_flags;		/* up/down, broadcast, etc. */
+	int	if_drv_flags;		/* driver-managed status flags */
+	int	if_capabilities;	/* interface features & capabilities */
+	int	if_capabilities2;	/* if_capabilities, part 2 */
+	int	if_capenable;		/* enabled features & capabilities */
+	int	if_capenable2;		/* if_capenable, part 2 */
+	void	*if_linkmib;		/* link-type-specific MIB data */
+	size_t	if_linkmiblen;		/* length of above data */
+	u_int	if_refcount;		/* reference count */
+
+	/* These fields are shared with struct if_data. */
+	uint8_t		if_type;	/* ethernet, tokenring, etc */
+	uint8_t		if_addrlen;	/* media address length */
+	uint8_t		if_hdrlen;	/* media header length */
+	uint8_t		if_link_state;	/* current link state */
+	uint32_t	if_mtu;		/* maximum transmission unit */
+	uint32_t	if_metric;	/* routing metric (external only) */
+	uint64_t	if_baudrate;	/* linespeed */
+	uint64_t	if_hwassist;	/* HW offload capabilities, see IFCAP */
+	time_t		if_epoch;	/* uptime at attach or stat reset */
+	struct timeval	if_lastchange;	/* time of last administrative change */
+
+	struct  ifaltq if_snd;		/* output queue (includes altq) */
+	struct	task if_linktask;	/* task for link change events */
+	struct	task if_addmultitask;	/* task for SIOCADDMULTI */
+
+	/* Addresses of different protocol families assigned to this if. */
+	struct mtx if_addr_lock;	/* lock to protect address lists */
+		/*
+		 * if_addrhead is the list of all addresses associated with
+		 * an interface.
+		 * Some code in the kernel assumes that the first element
+		 * of the list has type AF_LINK, and contains sockaddr_dl
+		 * addresses which store the link-level address and the name
+		 * of the interface.
+		 * However, access to the AF_LINK address through this
+		 * field is deprecated. Use if_addr instead.
+		 */
+	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
+	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
+	int	if_amcount;		/* number of all-multicast requests */
+	struct	ifaddr	*if_addr;	/* pointer to link-level address */
+	void	*if_hw_addr;		/* hardware link-level address */
+	const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
+	struct	mtx if_afdata_lock;
+	void	*if_afdata[AF_MAX];
+	int	if_afdata_initialized;
+
+	/* Additional features hung off the interface. */
+	u_int	if_fib;			/* interface FIB */
+	struct	vnet *if_vnet;		/* pointer to network stack instance */
+	struct	vnet *if_home_vnet;	/* where this ifnet originates from */
+	struct  ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
+	struct	bpf_if *if_bpf;		/* packet filter structure */
+	int	if_pcount;		/* number of promiscuous listeners */
+	void	*if_bridge;		/* bridge glue */
+	void	*if_lagg;		/* lagg glue */
+	void	*if_pf_kif;		/* pf glue */
+	struct	carp_if *if_carp;	/* carp interface structure */
+	struct	label *if_label;	/* interface MAC label */
+	struct	netmap_adapter *if_netmap; /* netmap(4) softc */
+
+	/* Various procedures of the layer2 encapsulation and drivers. */
+	if_output_fn_t if_output;	/* output routine (enqueue) */
+	if_input_fn_t if_input;		/* input routine (from h/w driver) */
+	struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *);
+	int	(*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
+		    struct rtentry *);
+	void (*if_bridge_linkstate)(struct ifnet *ifp);
+	if_start_fn_t	if_start;	/* initiate output routine */
+	if_ioctl_fn_t	if_ioctl;	/* ioctl routine */
+	if_init_fn_t	if_init;	/* Init routine */
+	int	(*if_resolvemulti)	/* validate/resolve multicast */
+		(struct ifnet *, struct sockaddr **, struct sockaddr *);
+	if_qflush_fn_t	if_qflush;	/* flush any queue */
+	if_transmit_fn_t if_transmit;   /* initiate output routine */
+
+	void	(*if_reassign)		/* reassign to vnet routine */
+		(struct ifnet *, struct vnet *, char *);
+	if_get_counter_t if_get_counter; /* get counter values */
+	int	(*if_requestencap)	/* make link header from request */
+		(struct ifnet *, struct if_encap_req *);
+
+	/* Statistics. */
+	counter_u64_t	if_counters[IFCOUNTERS];
+
+	/* Stuff that's only temporary and doesn't belong here. */
+
+	/*
+	 * Network adapter TSO limits:
+	 * ===========================
+	 *
+	 * If the "if_hw_tsomax" field is zero the maximum segment
+	 * length limit does not apply. If the "if_hw_tsomaxsegcount"
+	 * or the "if_hw_tsomaxsegsize" field is zero the TSO segment
+	 * count limit does not apply. If all three fields are zero,
+	 * there is no TSO limit.
+	 *
+	 * NOTE: The TSO limits should reflect the values used in the
+	 * BUSDMA tag a network adapter is using to load a mbuf chain
+	 * for transmission. The TCP/IP network stack will subtract
+	 * space for all linklevel and protocol level headers and
+	 * ensure that the full mbuf chain passed to the network
+	 * adapter fits within the given limits.
+	 */
+	u_int	if_hw_tsomax;		/* TSO maximum size in bytes */
+	u_int	if_hw_tsomaxsegcount;	/* TSO maximum segment count */
+	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
+
+	/*
+	 * Network adapter send tag support:
+	 */
+	if_snd_tag_alloc_t *if_snd_tag_alloc;
+
+	/* Ratelimit (packet pacing) */
+	if_ratelimit_query_t *if_ratelimit_query;
+	if_ratelimit_setup_t *if_ratelimit_setup;
+
+	/* Ethernet PCP */
+	uint8_t if_pcp;
+
+	/*
+	 * Debugnet (Netdump) hooks to be called while in db/panic.
+	 */
+	struct debugnet_methods *if_debugnet_methods;
+	struct epoch_context	if_epoch_ctx;
+
+	/*
+	 * Spare fields to be added before branching a stable branch, so
+	 * that the structure can be enhanced without changing the kernel
+	 * binary interface.
+	 */
+	int	if_ispare[4];		/* general use */
+};
+
+#endif	/* _KERNEL */
+
+#endif	/* _NET_IF_PRIVATE_H_ */
diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c
index 9a469a82c34c..afc7b446fca7 100644
--- a/sys/net/if_stf.c
+++ b/sys/net/if_stf.c
@@ -1,1062 +1,1063 @@
 /*	$FreeBSD$	*/
 /*	$KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2000 WIDE Project.
  * Copyright (c) 2010 Hiroki Sato <hrs@FreeBSD.org>
  * Copyright (c) 2013 Ermal Luci <eri@FreeBSD.org>
  * Copyright (c) 2017-2021 Rubicon Communications, LLC (Netgate)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * 6to4 interface, based on RFC3056.
  *
  * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting.
  * There is no address mapping defined from IPv6 multicast address to IPv4
  * address.  Therefore, we do not have IFF_MULTICAST on the interface.
  *
  * Due to the lack of address mapping for link-local addresses, we cannot
  * throw packets toward link-local addresses (fe80::x).  Also, we cannot throw
  * packets to link-local multicast addresses (ff02::x).
  *
  * Here are interesting symptoms due to the lack of link-local address:
  *
  * Unicast routing exchange:
  * - RIPng: Impossible.  Uses link-local multicast packet toward ff02::9,
  *   and link-local addresses as nexthop.
  * - OSPFv6: Impossible.  OSPFv6 assumes that there's link-local address
  *   assigned to the link, and makes use of them.  Also, HELLO packets use
  *   link-local multicast addresses (ff02::5 and ff02::6).
  * - BGP4+: Maybe.  You can only use global address as nexthop, and global
  *   address as TCP endpoint address.
  *
  * Multicast routing protocols:
  * - PIM: Hello packet cannot be used to discover adjacent PIM routers.
  *   Adjacent PIM routers must be configured manually (is it really spec-wise
  *   correct thing to do?).
  *
  * ICMPv6:
  * - Redirects cannot be used due to the lack of link-local address.
  *
  * stf interface does not have, and will not need, a link-local address.  
  * It seems to have no real benefit and does not help the above symptoms much.
  * Even if we assign link-locals to interface, we cannot really
  * use link-local unicast/multicast on top of 6to4 cloud (since there's no
  * encapsulation defined for link-local address), and the above analysis does
  * not change.  RFC3056 does not mandate the assignment of link-local address
  * either.
  *
  * 6to4 interface has security issues.  Refer to
  * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt
  * for details.  The code tries to filter out some of malicious packets.
  * Note that there is no way to be 100% secure.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <machine/cpu.h>
 
 #include <sys/malloc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/netisr.h>
 #include <net/if_stf.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip_ecn.h>
 
 #include <netinet/ip_encap.h>
 
 #include <machine/stdarg.h>
 
 #include <net/bpf.h>
 
 #include <security/mac/mac_framework.h>
 
 SDT_PROVIDER_DEFINE(if_stf);
 SDT_PROBE_DEFINE3(if_stf, , encapcheck, in, "struct mbuf *", "int", "int");
 SDT_PROBE_DEFINE0(if_stf, , encapcheck, accept);
 SDT_PROBE_DEFINE3(if_stf, , getsrcifa6, in, "struct ifnet *",
     "struct in6_addr *", "struct in6_addr *");
 SDT_PROBE_DEFINE2(if_stf, , getsrcifa6, found, "struct in6_addr *",
     "struct in6_addr *");
 SDT_PROBE_DEFINE0(if_stf, , getsrcifa6, notfound);
 
 SDT_PROBE_DEFINE4(if_stf, , stf_output, in, "struct ifnet *", "struct mbuf *",
     "struct sockaddr *", "struct route *");
 SDT_PROBE_DEFINE2(if_stf, , stf_output, error, "int", "int");
 SDT_PROBE_DEFINE1(if_stf, , stf_output, out, "int");
 
 SDT_PROBE_DEFINE3(if_stf, , checkaddr6, in, "struct stf_softc *",
     "struct in6_addr *", "struct ifnet *");
 SDT_PROBE_DEFINE2(if_stf, , checkaddr6, out, "int", "int");
 
 SDT_PROBE_DEFINE3(if_stf, , stf_input, in, "struct mbuf *", "int", "int");
 SDT_PROBE_DEFINE2(if_stf, , stf_input, out, "int", "int");
 
 SDT_PROBE_DEFINE3(if_stf, , ioctl, sv4net, "struct in_addr *",
     "struct in_addr *", "int");
 SDT_PROBE_DEFINE1(if_stf, , ioctl, sdstv4, "struct in_addr *");
 SDT_PROBE_DEFINE1(if_stf, , ioctl, ifaddr, "struct ifaddr *");
 
 SDT_PROBE_DEFINE4(if_stf, , getin4addr_in6, out, "struct in6_addr *",
     "struct in6_addr *", "struct in6_addr *", "struct sockaddr_in *");
 
 SDT_PROBE_DEFINE2(if_stf, , getin4addr, in, "struct in6_addr *", "struct in6_addr *");
 SDT_PROBE_DEFINE1(if_stf, , getin4addr, out, "struct sockaddr_in *");
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "6to4 Interface");
 
 static int stf_permit_rfc1918 = 0;
 SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN,
     &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses");
 
 #define STFUNIT		0
 
 #define IN6_IS_ADDR_6TO4(x)	(ntohs((x)->s6_addr16[0]) == 0x2002)
 
 /*
  * XXX: Return a pointer with 16-bit aligned.  Don't cast it to
  * struct in_addr *; use bcopy() instead.
  */
 #define GET_V4(x)	(&(x)->s6_addr16[1])
 
 struct stf_softc {
 	struct ifnet	*sc_ifp;
 	in_addr_t	braddr;		/* Border relay IPv4 address */
 	in_addr_t	srcv4_addr;	/* Our IPv4 WAN address */
 	u_int		v4prefixlen;	/* How much of the v4 address to include in our address. */
 	u_int		sc_fibnum;
 	const struct encaptab *encap_cookie;
 };
 #define STF2IFP(sc)	((sc)->sc_ifp)
 
 static const char stfname[] = "stf";
 
 static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface");
 static const int ip_stf_ttl = 40;
 
 static int in_stf_input(struct mbuf *, int, int, void *);
 static char *stfnames[] = {"stf0", "stf", "6to4", NULL};
 
 static int stfmodevent(module_t, int, void *);
 static int stf_encapcheck(const struct mbuf *, int, int, void *);
 static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *);
 static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *,
 	struct route *);
 static int isrfc1918addr(struct in_addr *);
 static int stf_checkaddr4(struct stf_softc *, struct in_addr *,
 	struct ifnet *);
 static int stf_checkaddr6(struct stf_softc *, struct in6_addr *,
 	struct ifnet *);
 static struct sockaddr_in *stf_getin4addr_in6(struct stf_softc *,
 	struct sockaddr_in *, struct in6_addr, struct in6_addr,
 	struct in6_addr);
 static struct sockaddr_in *stf_getin4addr(struct stf_softc *,
 	struct sockaddr_in *, struct in6_addr, struct in6_addr);
 static int stf_ioctl(struct ifnet *, u_long, caddr_t);
 
 VNET_DEFINE_STATIC(struct if_clone *, stf_cloner);
 #define V_stf_cloner	VNET(stf_cloner)
 
 static const struct encap_config ipv4_encap_cfg = {
 	.proto = IPPROTO_IPV6,
 	.min_length = sizeof(struct ip),
 	.exact_match = (sizeof(in_addr_t) << 3) + 8,
 	.check = stf_encapcheck,
 	.input = in_stf_input
 };
 
 static int
 stf_clone_match(struct if_clone *ifc, const char *name)
 {
 	int i;
 
 	for(i = 0; stfnames[i] != NULL; i++) {
 		if (strcmp(stfnames[i], name) == 0)
 			return (1);
 	}
 
 	return (0);
 }
 
 static int
 stf_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	char *dp;
 	int err, unit, wildcard;
 	struct stf_softc *sc;
 	struct ifnet *ifp;
 
 	err = ifc_name2unit(name, &unit);
 	if (err != 0)
 		return (err);
 	wildcard = (unit < 0);
 
 	/*
 	 * We can only have one unit, but since unit allocation is
 	 * already locked, we use it to keep from allocating extra
 	 * interfaces.
 	 */
 	unit = STFUNIT;
 	err = ifc_alloc_unit(ifc, &unit);
 	if (err != 0)
 		return (err);
 
 	sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO);
 	ifp = STF2IFP(sc) = if_alloc(IFT_STF);
 	if (ifp == NULL) {
 		free(sc, M_STF);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 	ifp->if_softc = sc;
 	sc->sc_fibnum = curthread->td_proc->p_fibnum;
 
 	/*
 	 * Set the name manually rather then using if_initname because
 	 * we don't conform to the default naming convention for interfaces.
 	 * In the wildcard case, we need to update the name.
 	 */
 	if (wildcard) {
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			/*
 			 * This can only be a programmer error and
 			 * there's no straightforward way to recover if
 			 * it happens.
 			 */
 			panic("if_clone_create(): interface name too long");
 		}
 	}
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = stfname;
 	ifp->if_dunit = IF_DUNIT_NONE;
 
 	sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK);
 	if (sc->encap_cookie == NULL) {
 		if_printf(ifp, "attach failed\n");
 		free(sc, M_STF);
 		ifc_free_unit(ifc, unit);
 		return (ENOMEM);
 	}
 
 	ifp->if_mtu    = IPV6_MMTU;
 	ifp->if_ioctl  = stf_ioctl;
 	ifp->if_output = stf_output;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	*ifpp = ifp;
 
 	return (0);
 }
 
 static int
 stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct stf_softc *sc = ifp->if_softc;
 	int err __unused;
 
 	err = ip_encap_detach(sc->encap_cookie);
 	KASSERT(err == 0, ("Unexpected error detaching encap_cookie"));
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 
 	free(sc, M_STF);
 	ifc_free_unit(ifc, STFUNIT);
 
 	return (0);
 }
 
 static void
 vnet_stf_init(const void *unused __unused)
 {
 	struct if_clone_addreq req = {
 		.match_f = stf_clone_match,
 		.create_f = stf_clone_create,
 		.destroy_f = stf_clone_destroy,
 	};
 	V_stf_cloner = ifc_attach_cloner(stfname, &req);
 }
 VNET_SYSINIT(vnet_stf_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_stf_init, NULL);
 
 static void
 vnet_stf_uninit(const void *unused __unused)
 {
 	if_clone_detach(V_stf_cloner);
 	V_stf_cloner = NULL;
 }
 VNET_SYSUNINIT(vnet_stf_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_stf_uninit,
     NULL);
 
 static int
 stfmodevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		/* Done in vnet_stf_init() */
 		break;
 	case MOD_UNLOAD:
 		/* Done in vnet_stf_uninit() */
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 static moduledata_t stf_mod = {
 	"if_stf",
 	stfmodevent,
 	0
 };
 
 DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_stf, 2);
 
 static int
 stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 	struct ip ip;
 	struct stf_softc *sc;
 	struct in6_addr addr6, mask6;
 	struct sockaddr_in sin4addr, sin4mask;
 
 	SDT_PROBE3(if_stf, , encapcheck, in, m, off, proto);
 
 	sc = (struct stf_softc *)arg;
 	if (sc == NULL)
 		return (0);
 
 	if ((STF2IFP(sc)->if_flags & IFF_UP) == 0)
 		return (0);
 
 	/* IFF_LINK0 means "no decapsulation" */
 	if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0)
 		return (0);
 
 	if (proto != IPPROTO_IPV6)
 		return (0);
 
 	m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
 
 	if (ip.ip_v != 4)
 		return (0);
 
 	if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0)
 		return (0);
 
 	if (sc->srcv4_addr != INADDR_ANY) {
 		sin4addr.sin_addr.s_addr = sc->srcv4_addr;
 		sin4addr.sin_family = AF_INET;
 	} else
 		if (stf_getin4addr(sc, &sin4addr, addr6, mask6) == NULL)
 			return (0);
 
 	if (sin4addr.sin_addr.s_addr != ip.ip_dst.s_addr)
 		return (0);
 
 	if (IN6_IS_ADDR_6TO4(&addr6)) {
 		/*
 		 * 6to4 (RFC 3056).
 		 * Check if IPv4 src matches the IPv4 address derived
 		 * from the local 6to4 address masked by prefixmask.
 		 * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24
 		 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24
 		 */
 		memcpy(&sin4mask.sin_addr, GET_V4(&mask6),
 		    sizeof(sin4mask.sin_addr));
 		if ((sin4addr.sin_addr.s_addr & sin4mask.sin_addr.s_addr) !=
 		    (ip.ip_src.s_addr & sin4mask.sin_addr.s_addr))
 			return (0);
 	} else {
 		/* 6rd (RFC 5569) */
 		/*
 		 * No restriction on the src address in the case of
 		 * 6rd because the stf(4) interface always has a
 		 * prefix which covers whole of IPv4 src address
 		 * range.  So, stf_output() will catch all of
 		 * 6rd-capsuled IPv4 traffic with suspicious inner dst
 		 * IPv4 address (i.e. the IPv6 destination address is
 		 * one the admin does not like to route to outside),
 		 * and then it discard them silently.
 		 */
 	}
 
 	SDT_PROBE0(if_stf, , encapcheck, accept);
 
 	/* stf interface makes single side match only */
 	return (32);
 }
 
 static int
 stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask)
 {
 	struct ifaddr *ia;
 	struct in_ifaddr *ia4;
 	struct in6_addr addr6, mask6;
 	struct sockaddr_in sin4;
 	struct stf_softc *sc;
 	struct in_addr in;
 
 	NET_EPOCH_ASSERT();
 
 	sc = ifp->if_softc;
 
 	SDT_PROBE3(if_stf, , getsrcifa6, in, ifp, addr, mask);
 
 	CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
 		if (ia->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		addr6 = *IFA_IN6(ia);
 		mask6 = *IFA_MASKIN6(ia);
 		if (sc->srcv4_addr != INADDR_ANY)
 			bcopy(&sc->srcv4_addr, &in, sizeof(in));
 		else {
 			if (stf_getin4addr(sc, &sin4, addr6, mask6) == NULL)
 				continue;
 			bcopy(&sin4.sin_addr, &in, sizeof(in));
 		}
 
 		CK_LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash)
 			if (ia4->ia_addr.sin_addr.s_addr == in.s_addr)
 				break;
 		if (ia4 == NULL)
 			continue;
 
 		*addr = addr6;
 		*mask = mask6;
 
 		SDT_PROBE2(if_stf, , getsrcifa6, found, addr, mask);
 
 		return (0);
 	}
 
 	SDT_PROBE0(if_stf, , getsrcifa6, notfound);
 
 	return (ENOENT);
 }
 
 static int
 stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct stf_softc *sc;
 	const struct sockaddr_in6 *dst6;
 	struct sockaddr_in dst4, src4;
 	u_int8_t tos;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct in6_addr addr6, mask6;
 	int error;
 
 	SDT_PROBE4(if_stf, , stf_output, in, ifp, m, dst, ro);
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_output, error, error, __LINE__);
 		return (error);
 	}
 #endif
 
 	sc = ifp->if_softc;
 	dst6 = (const struct sockaddr_in6 *)dst;
 
 	/* just in case */
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		SDT_PROBE2(if_stf, , stf_output, error, ENETDOWN, __LINE__);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * If we don't have an ip4 address that match my inner ip6 address,
 	 * we shouldn't generate output.  Without this check, we'll end up
 	 * using wrong IPv4 source.
 	 */
 	if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		SDT_PROBE2(if_stf, , stf_output, error, ENETDOWN, __LINE__);
 		return (ENETDOWN);
 	}
 
 	if (m->m_len < sizeof(*ip6)) {
 		m = m_pullup(m, sizeof(*ip6));
 		if (!m) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			SDT_PROBE2(if_stf, , stf_output, error, ENOBUFS,
 			    __LINE__);
 			return (ENOBUFS);
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 	tos = IPV6_TRAFFIC_CLASS(ip6);
 
 	/*
 	 * Pickup the right outer dst addr from the list of candidates.
 	 * ip6_dst has priority as it may be able to give us shorter IPv4 hops.
 	 */
 	if (stf_getin4addr_in6(sc, &dst4, addr6, mask6,
 	    ip6->ip6_dst) == NULL) {
 		if (sc->braddr != INADDR_ANY)
 			dst4.sin_addr.s_addr = sc->braddr;
 		else if (stf_getin4addr_in6(sc, &dst4, addr6, mask6,
 		    dst6->sin6_addr) == NULL) {
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			SDT_PROBE2(if_stf, , stf_output, error, ENETUNREACH,
 			    __LINE__);
 			return (ENETUNREACH);
 		}
 	}
 
 	if (bpf_peers_present(ifp->if_bpf)) {
 		/*
 		 * We need to prepend the address family as
 		 * a four byte field.  Cons up a dummy header
 		 * to pacify bpf.  This is safe because bpf
 		 * will only read from the mbuf (i.e., it won't
 		 * try to free it or keep a pointer a to it).
 		 */
 		u_int af = AF_INET6;
 		bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
 	}
 
 	M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		SDT_PROBE2(if_stf, , stf_output, error, ENOBUFS, __LINE__);
 		return (ENOBUFS);
 	}
 	ip = mtod(m, struct ip *);
 
 	bzero(ip, sizeof(*ip));
 
 	if (sc->srcv4_addr != INADDR_ANY)
 		src4.sin_addr.s_addr = sc->srcv4_addr;
 	else if (stf_getin4addr(sc, &src4, addr6, mask6) == NULL) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		SDT_PROBE2(if_stf, , stf_output, error, ENETUNREACH, __LINE__);
 		return (ENETUNREACH);
 	}
 	bcopy(&src4.sin_addr, &ip->ip_src, sizeof(ip->ip_src));
 	bcopy(&dst4.sin_addr, &ip->ip_dst, sizeof(ip->ip_dst));
 
 	ip->ip_p = IPPROTO_IPV6;
 	ip->ip_ttl = ip_stf_ttl;
 	ip->ip_len = htons(m->m_pkthdr.len);
 	if (ifp->if_flags & IFF_LINK1)
 		ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos);
 	else
 		ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos);
 
 	M_SETFIB(m, sc->sc_fibnum);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	error = ip_output(m, NULL, NULL, 0, NULL, NULL);
 
 	SDT_PROBE1(if_stf, , stf_output, out, error);
 	return (error);
 }
 
 static int
 isrfc1918addr(struct in_addr *in)
 {
 	/*
 	 * returns 1 if private address range:
 	 * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16
 	 */
 	if (stf_permit_rfc1918 == 0 && (
 	    (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 ||
 	    (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 ||
 	    (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168))
 		return (1);
 
 	return (0);
 }
 
 static int
 stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp)
 {
 	struct in_ifaddr *ia4;
 
 	/*
 	 * reject packets with the following address:
 	 * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8
 	 */
 	if (IN_MULTICAST(ntohl(in->s_addr)))
 		return (-1);
 	switch ((ntohl(in->s_addr) & 0xff000000) >> 24) {
 	case 0: case 127: case 255:
 		return (-1);
 	}
 
 	/*
 	 * reject packets with broadcast
 	 */
 	CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) {
 		if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
 			continue;
 		if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) {
 			return (-1);
 		}
 	}
 
 	/*
 	 * perform ingress filter
 	 */
 	if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) {
 		struct nhop_object *nh;
 
 		NET_EPOCH_ASSERT();
 		nh = fib4_lookup(sc->sc_fibnum, *in, 0, 0, 0);
 		if (nh == NULL)
 			return (-1);
 
 		if (nh->nh_ifp != inifp)
 			return (-1);
 	}
 
 	return (0);
 }
 
 static int
 stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp)
 {
 	SDT_PROBE3(if_stf, , checkaddr6, in, sc, in6, inifp);
 
 	/*
 	 * check 6to4 addresses
 	 */
 	if (IN6_IS_ADDR_6TO4(in6)) {
 		struct in_addr in4;
 		int ret;
 
 		bcopy(GET_V4(in6), &in4, sizeof(in4));
 		ret = stf_checkaddr4(sc, &in4, inifp);
 		SDT_PROBE2(if_stf, , checkaddr6, out, ret, __LINE__);
 		return (ret);
 	}
 
 	/*
 	 * reject anything that look suspicious.  the test is implemented
 	 * in ip6_input too, but we check here as well to
 	 * (1) reject bad packets earlier, and
 	 * (2) to be safe against future ip6_input change.
 	 */
 	if (IN6_IS_ADDR_V4COMPAT(in6)) {
 		SDT_PROBE2(if_stf, , checkaddr6, out, -1, __LINE__);
 		return (-1);
 	}
 
 	if (IN6_IS_ADDR_V4MAPPED(in6)) {
 		SDT_PROBE2(if_stf, , checkaddr6, out, -1, __LINE__);
 		return (-1);
 	}
 
 	SDT_PROBE2(if_stf, , checkaddr6, out, 0, __LINE__);
 	return (0);
 }
 
 static int
 in_stf_input(struct mbuf *m, int off, int proto, void *arg)
 {
 	struct stf_softc *sc = arg;
 	struct ip ip;
 	struct ip6_hdr *ip6;
 	u_int8_t otos, itos;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 
 	SDT_PROBE3(if_stf, , stf_input, in, m, off, proto);
 
 	if (proto != IPPROTO_IPV6) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	m_copydata(m, 0, sizeof(struct ip), (caddr_t)&ip);
 	if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	ifp = STF2IFP(sc);
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * perform sanity check against outer src/dst.
 	 * for source, perform ingress filter as well.
 	 */
 	if (stf_checkaddr4(sc, &ip.ip_dst, NULL) < 0 ||
 	    stf_checkaddr4(sc, &ip.ip_src, m->m_pkthdr.rcvif) < 0) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	otos = ip.ip_tos;
 	m_adj(m, off);
 
 	if (m->m_len < sizeof(*ip6)) {
 		m = m_pullup(m, sizeof(*ip6));
 		if (!m) {
 			SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE,
 			    __LINE__);
 			return (IPPROTO_DONE);
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * perform sanity check against inner src/dst.
 	 * for source, perform ingress filter as well.
 	 */
 	if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 ||
 	    stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * reject packets with private address range.
 	 * (requirement from RFC3056 section 2 1st paragraph)
 	 */
 	if ((IN6_IS_ADDR_6TO4(&ip6->ip6_src) && isrfc1918addr(&ip.ip_src)) ||
 	    (IN6_IS_ADDR_6TO4(&ip6->ip6_dst) && isrfc1918addr(&ip.ip_dst))) {
 		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Ignore if the destination is the same stf interface because
 	 * all of valid IPv6 outgoing traffic should go interfaces
 	 * except for it.
 	 */
 	nh = fib6_lookup(sc->sc_fibnum, &ip6->ip6_dst, 0, 0, 0);
 	if (nh == NULL) {
-		m_free(m);
+		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 	if ((nh->nh_ifp == ifp) &&
 	    (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &nh->gw6_sa.sin6_addr))) {
-		m_free(m);
+		m_freem(m);
 		SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 		return (IPPROTO_DONE);
 	}
 
 	itos = IPV6_TRAFFIC_CLASS(ip6);
 	if ((ifp->if_flags & IFF_LINK1) != 0)
 		ip_ecn_egress(ECN_ALLOWED, &otos, &itos);
 	else
 		ip_ecn_egress(ECN_NOCARE, &otos, &itos);
 	ip6->ip6_flow &= ~htonl(0xff << 20);
 	ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
 
 	m->m_pkthdr.rcvif = ifp;
 
 	if (bpf_peers_present(ifp->if_bpf)) {
 		/*
 		 * We need to prepend the address family as
 		 * a four byte field.  Cons up a dummy header
 		 * to pacify bpf.  This is safe because bpf
 		 * will only read from the mbuf (i.e., it won't
 		 * try to free it or keep a pointer a to it).
 		 */
 		u_int32_t af = AF_INET6;
 		bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
 	}
 
 	/*
 	 * Put the packet to the network layer input queue according to the
 	 * specified address family.
 	 * See net/if_gif.c for possible issues with packet processing
 	 * reorder due to extra queueing.
 	 */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(NETISR_IPV6, m);
 	SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__);
 	return (IPPROTO_DONE);
 }
 
 static struct sockaddr_in *
 stf_getin4addr_in6(struct stf_softc *sc, struct sockaddr_in *sin,
     struct in6_addr addr6, struct in6_addr mask6, struct in6_addr in6)
 {
        int i;
        struct sockaddr_in *out;
 
 	/*
 	* When (src addr & src mask) != (in6 & src mask),
 	* the dst is not in the 6rd domain.  The IPv4 address must
 	* not be used.
 	*/
 	for (i = 0; i < sizeof(addr6); i++) {
 		if ((((u_char *)&addr6)[i] & ((u_char *)&mask6)[i]) !=
 		    (((u_char *)&in6)[i] & ((u_char *)&mask6)[i])) {
 			SDT_PROBE4(if_stf, , getin4addr_in6, out, &addr6,
 			    &mask6, &in6, NULL);
 			return (NULL);
 		}
 	}
 
 	/* After the mask check, use in6 instead of addr6. */
 	out = stf_getin4addr(sc, sin, in6, mask6);
 	SDT_PROBE4(if_stf, , getin4addr_in6, out, &addr6, &mask6, &in6, out);
 	return (out);
 }
 
 static struct sockaddr_in *
 stf_getin4addr(struct stf_softc *sc, struct sockaddr_in *sin,
     struct in6_addr addr6, struct in6_addr mask6)
 {
 	struct in_addr *in;
 
 	SDT_PROBE2(if_stf, , getin4addr, in, &addr6, &mask6);
 
 	memset(sin, 0, sizeof(*sin));
 	in = &sin->sin_addr;
 	if (IN6_IS_ADDR_6TO4(&addr6)) {
 		/* 6to4 (RFC 3056) */
 		bcopy(GET_V4(&addr6), in, sizeof(*in));
 		if (isrfc1918addr(in))
 			return (NULL);
 	} else {
 		/* 6rd (RFC 5569) */
 		in_addr_t v4prefix;
 		uint8_t *v6 = (uint8_t*)&addr6;
 		uint64_t v6prefix;
 		u_int plen;
 		u_int v4suffixlen;
 
 		v4prefix = 0;
 		if (sc->v4prefixlen < 32) {
 			v4suffixlen = 32 - sc->v4prefixlen;
 			v4prefix = ntohl(sc->srcv4_addr) &
-			    (0xffffffffU << v4suffixlen);
+			    ~(uint32_t)((1ULL << v4suffixlen) - 1);
 		} else {
 			MPASS(sc->v4prefixlen == 32);
 			v4suffixlen = 32;
 		}
 
 		plen = in6_mask2len(&mask6, NULL);
 		if (plen > 64)
 			return (NULL);
 
 		/* To make this simple we do not support prefixes longer than
 		 * 64 bits. RFC5969 says "a 6rd delegated prefix SHOULD be /64
 		 * or shorter." so this is a moderately safe assumption. */
 		v6prefix = be64toh(*(uint64_t *)v6);
 
 		/* Shift away the v6 prefix itself. */
 		v6prefix <<= plen;
 		v6prefix >>= plen;
 
 		/* Now shift away everything after the v4 address. */
 		v6prefix >>= 64 - plen - v4suffixlen;
 
 		sin->sin_addr.s_addr = htonl(v4prefix | (uint32_t)v6prefix);
 	}
 
 	SDT_PROBE1(if_stf, , getin4addr, out, sin);
 
 	return (sin);
 }
 
 static int
 stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifaddr *ifa;
 	struct ifdrv *ifd;
 	struct ifreq *ifr;
 	struct sockaddr_in sin4;
 	struct stf_softc *sc_cur;
 	struct stfv4args args;
 	int error, mtu;
 
 	error = 0;
 	sc_cur = ifp->if_softc;
 
 	switch (cmd) {
 	case SIOCSDRVSPEC:
 		ifd = (struct ifdrv *)data;
 		error = priv_check(curthread, PRIV_NET_ADDIFADDR);
 		if (error)
 			break;
 		if (ifd->ifd_cmd == STF6RD_SV4NET) {
 			if (ifd->ifd_len != sizeof(args)) {
 				error = EINVAL;
 				break;
 			}
 			bzero(&args, sizeof(args));
 			error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 			if (error)
 				break;
 
 			if (args.v4_prefixlen < 1 || args.v4_prefixlen > 32) {
 				error = EINVAL;
 				break;
 			}
 
 			bcopy(&args.srcv4_addr, &sc_cur->srcv4_addr,
 			    sizeof(sc_cur->srcv4_addr));
 			sc_cur->v4prefixlen = args.v4_prefixlen;
 			SDT_PROBE3(if_stf, , ioctl, sv4net, sc_cur->srcv4_addr,
 			    sc_cur->srcv4_addr, sc_cur->v4prefixlen);
 		} else if (ifd->ifd_cmd == STF6RD_SBR) {
 			if (ifd->ifd_len != sizeof(args)) {
 				error = EINVAL;
 				break;
 			}
 			bzero(&args, sizeof(args));
 			error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 			if (error)
 				break;
 			sc_cur->braddr = args.braddr.s_addr;
 			SDT_PROBE1(if_stf, , ioctl, sdstv4,
 			    sc_cur->braddr);
 		} else
 			error = EINVAL;
 		break;
 	case SIOCGDRVSPEC:
 		ifd = (struct ifdrv *)data;
 		if (ifd->ifd_cmd != STF6RD_GV4NET) {
 			error = EINVAL;
 			break;
 		}
 		if (ifd->ifd_len != sizeof(args)) {
 			error = EINVAL;
 			break;
 		}
 		bzero(&args, sizeof(args));
 		args.srcv4_addr.s_addr = sc_cur->srcv4_addr;
 		args.braddr.s_addr = sc_cur->braddr;
 		args.v4_prefixlen = sc_cur->v4prefixlen;
 		error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
 		break;
 	case SIOCSIFADDR:
 		ifa = (struct ifaddr *)data;
 		SDT_PROBE1(if_stf, , ioctl, ifaddr, ifa);
 		if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			break;
 		}
 		if (stf_getin4addr(sc_cur, &sin4,
 		    satosin6(ifa->ifa_addr)->sin6_addr,
 		    satosin6(ifa->ifa_netmask)->sin6_addr) == NULL) {
 			error = EINVAL;
 			break;
 		}
 		ifp->if_flags |= IFF_UP;
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		ifr = (struct ifreq *)data;
 		if (ifr && ifr->ifr_addr.sa_family == AF_INET6)
 			;
 		else
 			error = EAFNOSUPPORT;
 		break;
 
 	case SIOCGIFMTU:
 		break;
 
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 		mtu = ifr->ifr_mtu;
 		/* RFC 4213 3.2 ideal world MTU */
 		if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20)
 			return (EINVAL);
 		ifp->if_mtu = mtu;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c
index 8328f9f94442..7e4f47746e23 100644
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@@ -1,2019 +1,2020 @@
 /*	$NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $	*/
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * BASED ON:
  * -------------------------------------------------------------------------
  *
  * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
  * Nottingham University 1987.
  *
  * This source may be freely distributed, however I would be interested
  * in any changes that are made.
  *
  * This driver takes packets off the IP i/f and hands them up to a
  * user process to have its wicked way with. This driver has its
  * roots in a similar driver written by Phil Cockcroft (formerly) at
  * UCL. This driver is based much more on read/write/poll mode of
  * operation though.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/ttycom.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/random.h>
 #include <sys/ctype.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <netinet/in.h>
 #ifdef INET
 #include <netinet/ip.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/tcp.h>
 #include <net/bpf.h>
 #include <net/if_tap.h>
 #include <net/if_tun.h>
 
 #include <dev/virtio/network/virtio_net.h>
 
 #include <sys/queue.h>
 #include <sys/condvar.h>
 #include <security/mac/mac_framework.h>
 
 struct tuntap_driver;
 
 /*
  * tun_list is protected by global tunmtx.  Other mutable fields are
  * protected by tun->tun_mtx, or by their owning subsystem.  tun_dev is
  * static for the duration of a tunnel interface.
  */
 struct tuntap_softc {
 	TAILQ_ENTRY(tuntap_softc)	 tun_list;	/* linkage on tunhead */
 	struct cdev			*tun_alias;	/* alias cdev made on rename */
 	struct cdev			*tun_dev;	/* cdev from make_dev_s() */
 	u_short				 tun_flags;	/* misc flags */
 #define	TUN_OPEN	0x0001	/* set by tunopen, cleared by tundtor */
 #define	TUN_INITED	0x0002	/* set by tuncreate once the ifnet is attached */
 #define	TUN_UNUSED1	0x0008
 #define	TUN_UNUSED2	0x0010
 #define	TUN_LMODE	0x0020
 #define	TUN_RWAIT	0x0040	/* cleared with wakeup(tp) by the start routines */
 #define	TUN_ASYNC	0x0080	/* with tun_sigio set, SIGIO is posted on output */
 #define	TUN_IFHEAD	0x0100
 #define	TUN_DYING	0x0200	/* set by tun_destroy; tun_busy then fails */
 #define	TUN_L2		0x0400	/* Ethernet-style driver (tap and vmnet) */
 #define	TUN_VMNET	0x0800	/* vmnet driver (always combined with TUN_L2) */
 
 /* Bits compared against tuntap_driver.ident_flags to pick a driver. */
 #define	TUN_DRIVER_IDENT_MASK	(TUN_L2 | TUN_VMNET)
 /* Both bits are required before an L2 tunnel will pass traffic. */
 #define	TUN_READY		(TUN_OPEN | TUN_INITED)
 
 	pid_t			 tun_pid;	/* owning pid */
 	struct ifnet		*tun_ifp;	/* the interface */
 	struct sigio		*tun_sigio;	/* async I/O info */
 	struct tuntap_driver	*tun_drv;	/* appropriate driver */
 	struct selinfo		 tun_rsel;	/* read select */
 	struct mtx		 tun_mtx;	/* softc field mutex */
 	struct cv		 tun_cv;	/* for ref'd dev destroy */
 	struct ether_addr	 tun_ether;	/* remote address */
 	int			 tun_busy;	/* busy count */
 	int			 tun_vhdrlen;	/* virtio-net header length */
 };
 /* Shorthand for the ifnet that belongs to a softc. */
 #define	TUN2IFP(sc)	((sc)->tun_ifp)
 
 /*
  * Debug printf gated on the tundebug knob.  This expands to a bare "if", so
  * only use it as a complete statement; never place it as the unbraced body
  * of an if that has an else.
  */
 #define	TUNDEBUG	if (tundebug) if_printf
 
 /*
  * Per-softc mutex helpers.  None of these carries a trailing semicolon; the
  * caller supplies it, which keeps them safe inside unbraced if/else bodies.
  */
 #define	TUN_LOCK(tp)		mtx_lock(&(tp)->tun_mtx)
 #define	TUN_UNLOCK(tp)		mtx_unlock(&(tp)->tun_mtx)
 #define	TUN_LOCK_ASSERT(tp)	mtx_assert(&(tp)->tun_mtx, MA_OWNED)
 
 #define	TUN_VMIO_FLAG_MASK	0x0fff
 
 /*
  * Interface capabilities of a tap device that supports the virtio-net
  * header.
  */
 #define TAP_VNET_HDR_CAPS	(IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6	\
 				| IFCAP_VLAN_HWCSUM			\
 				| IFCAP_TSO | IFCAP_LRO			\
 				| IFCAP_VLAN_HWTSO)
 
 /* Every checksum/TSO offload flag a tap device may carry. */
 #define TAP_ALL_OFFLOAD		(CSUM_TSO | CSUM_TCP | CSUM_UDP |\
 				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6)
 
 /*
  * All mutable global variables in if_tun are locked using tunmtx, with
  * the exception of tundebug, which is used unlocked, and the drivers' *clones,
  * which are static after setup.
  */
 static struct mtx tunmtx;		/* protects tunhead and the unit allocators */
 static eventhandler_tag arrival_tag;	/* ifnet_arrival_event -> tunrename */
 static eventhandler_tag clone_tag;	/* dev_clone -> tunclone */
 static const char tunname[] = "tun";
 static const char tapname[] = "tap";
 static const char vmnetname[] = "vmnet";
 static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface");
 static int tundebug = 0;	/* gates TUNDEBUG output */
 static int tundclone = 1;	/* enable devfs cloning for tun */
 static int tap_allow_uopen = 0;	/* allow user devfs cloning */
 static int tapuponopen = 0;	/* IFF_UP on open() */
 static int tapdclone = 1;	/* enable devfs cloning */
 
 /* Every softc created by tun_create_device(); guarded by tunmtx. */
 static TAILQ_HEAD(,tuntap_softc)	tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
 
 /*
  * Held while if_softc is cleared in tun_destroy() and while tunrename()
  * checks that the softc is still attached.
  */
 static struct sx tun_ioctl_sx;
 SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
 
 SYSCTL_DECL(_net_link);
 /* tun */
 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP tunnel software network interface");
 SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0,
     "Enable legacy devfs interface creation");
 
 /* tap */
 static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Ethernet tunnel software network interface");
 SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0,
     "Enable legacy devfs interface creation for all users");
 SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0,
     "Bring interface up when /dev/tap is opened");
 SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0,
     "Enable legacy devfs interface creation");
 /* Same backing variable as debug.if_tun_debug above. */
 SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, "");
 
 /* Device creation and busy-reference helpers. */
 static int	tun_create_device(struct tuntap_driver *drv, int unit,
     struct ucred *cr, struct cdev **dev, const char *name);
 static int	tun_busy_locked(struct tuntap_softc *tp);
 static void	tun_unbusy_locked(struct tuntap_softc *tp);
 static int	tun_busy(struct tuntap_softc *tp);
 static void	tun_unbusy(struct tuntap_softc *tp);
 
 /* Name parsing, event handlers and ifnet methods. */
 static int	tuntap_name2info(const char *name, int *unit, int *flags);
 static void	tunclone(void *arg, struct ucred *cred, char *name,
 		    int namelen, struct cdev **dev);
 static void	tuncreate(struct cdev *dev);
 static void	tundtor(void *data);
 static void	tunrename(void *arg, struct ifnet *ifp);
 static int	tunifioctl(struct ifnet *, u_long, caddr_t);
 static void	tuninit(struct ifnet *);
 static void	tunifinit(void *xtp);
 static int	tuntapmodevent(module_t, int, void *);
 static int	tunoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *ro);
 static void	tunstart(struct ifnet *);
 static void	tunstart_l2(struct ifnet *);
 
 /* if_clone callbacks, one match routine per driver flavour. */
 static int	tun_clone_match(struct if_clone *ifc, const char *name);
 static int	tap_clone_match(struct if_clone *ifc, const char *name);
 static int	vmnet_clone_match(struct if_clone *ifc, const char *name);
 static int	tun_clone_create(struct if_clone *, char *, size_t,
 		    struct ifc_data *, struct ifnet **);
 static int	tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 static void	tun_vnethdr_set(struct ifnet *ifp, int vhdrlen);
 
 /* Character device entry points shared by all three drivers. */
 static d_open_t		tunopen;
 static d_read_t		tunread;
 static d_write_t	tunwrite;
 static d_ioctl_t	tunioctl;
 static d_poll_t		tunpoll;
 static d_kqfilter_t	tunkqfilter;
 
 /* kqueue filter callbacks used by the filterops tables. */
 static int		tunkqread(struct knote *, long);
 static int		tunkqwrite(struct knote *, long);
 static void		tunkqdetach(struct knote *);
 
 /* kqueue filter for readability; events are evaluated by tunkqread. */
 static struct filterops tun_read_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqread,
 };
 
 /* kqueue filter for writability; shares tunkqdetach with the read filter. */
 static struct filterops tun_write_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqwrite,
 };
 
 /*
  * One entry per driver flavour (tun, tap, vmnet).  ident_flags is what
  * tuntap_driver_from_flags() matches against a softc's identifying bits;
  * unrhdr and clones are filled in by tuntapmodevent() at MOD_LOAD.
  */
 static struct tuntap_driver {
 	struct cdevsw		 cdevsw;
 	int			 ident_flags;
 	struct unrhdr		*unrhdr;
 	struct clonedevs	*clones;
 	ifc_match_f		*clone_match_fn;
 	ifc_create_f		*clone_create_fn;
 	ifc_destroy_f		*clone_destroy_fn;
 } tuntap_drivers[] = {
 	/* tun: no identifying bits set. */
 	{
 		.ident_flags =	0,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tunname,
 		},
 		.clone_match_fn =	tun_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	/* tap: L2 only. */
 	{
 		.ident_flags =	TUN_L2,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tapname,
 		},
 		.clone_match_fn =	tap_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	/* vmnet: L2 plus the vmnet bit. */
 	{
 		.ident_flags =	TUN_L2 | TUN_VMNET,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		vmnetname,
 		},
 		.clone_match_fn =	vmnet_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 };
 
 /*
  * Per-vnet pairing of a driver with the if_clone handle that
  * vnet_tun_init() obtained for it from ifc_attach_cloner().
  */
 struct tuntap_driver_cloner {
 	SLIST_ENTRY(tuntap_driver_cloner)	 link;
 	struct tuntap_driver			*drv;
 	struct if_clone				*cloner;
 };
 
 /* Populated by vnet_tun_init(), drained by vnet_tun_uninit(). */
 VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) =
     SLIST_HEAD_INITIALIZER(tuntap_driver_cloners);
 
 #define	V_tuntap_driver_cloners	VNET(tuntap_driver_cloners)
 
 /*
  * Busy references let unrelated operations (for example work on the device
  * nodes) run against a tunnel without racing tun_destroy.  tun_destroy waits
  * on the softc's condvar while the busy count is non-zero and is woken once
  * the last reference is dropped.
  *
  * Takes one busy reference; the caller must hold the softc mutex.  Returns 0
  * on success, or EBUSY when the tunnel has already been marked TUN_DYING.
  */
 static int
 tun_busy_locked(struct tuntap_softc *tp)
 {
 
 	TUN_LOCK_ASSERT(tp);
 	if ((tp->tun_flags & TUN_DYING) == 0) {
 		tp->tun_busy += 1;
 		return (0);
 	}
 
 	/*
 	 * EBUSY here means "busy going away".  Reporting EBUSY for a tunnel
 	 * that is merely in use would make little sense, because adding one
 	 * more reference to a busy tunnel is harmless.
 	 */
 	return (EBUSY);
 }
 
 /*
  * Drops one busy reference; the caller must hold the softc mutex.  When the
  * count reaches zero every waiter on tun_cv is woken.
  */
 static void
 tun_unbusy_locked(struct tuntap_softc *tp)
 {
 
 	TUN_LOCK_ASSERT(tp);
 	KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel"));
 
 	if (--tp->tun_busy != 0)
 		return;
 
 	/* Last reference gone: release anything sleeping on the tunnel. */
 	cv_broadcast(&tp->tun_cv);
 }
 
 /*
  * Unlocked wrapper around tun_busy_locked(): takes the softc mutex, tries to
  * add a busy reference, and returns that call's result (0 or EBUSY).
  */
 static int
 tun_busy(struct tuntap_softc *tp)
 {
 	int error;
 
 	TUN_LOCK(tp);
 	error = tun_busy_locked(tp);
 	TUN_UNLOCK(tp);
 
 	return (error);
 }
 
 /*
  * Unlocked wrapper around tun_unbusy_locked(): drops one busy reference
  * while holding the softc mutex.
  */
 static void
 tun_unbusy(struct tuntap_softc *tp)
 {
 	TUN_LOCK(tp);
 	tun_unbusy_locked(tp);
 	TUN_UNLOCK(tp);
 }
 
 /*
  * Resolves a device name to its unit number and driver identifying flags.
  * Must be called with the correct vnet context.
  *
  * Either output pointer may be NULL when the caller does not need that
  * value.  A bare driver name (no unit suffix) yields a unit of -1.
  * Returns 0 on success, EINVAL for a NULL name, and ENXIO when no
  * registered driver matches.
  */
 static int
 tuntap_name2info(const char *name, int *outunit, int *outflags)
 {
 	struct tuntap_driver_cloner *cloner;
 	const char *stem;
 	char *scratch;
 	int flags, unit;
 	bool matched;
 
 	if (name == NULL)
 		return (EINVAL);
 
 	/*
 	 * dev_stdclone takes a non-const name only so that it can hand back
 	 * a char * through its second argument.  That argument is always
 	 * NULL here, so the name is never modified and dropping the
 	 * qualifier is safe.
 	 */
 	scratch = __DECONST(char *, name);
 	matched = false;
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(cloner, &V_tuntap_driver_cloners, link) {
 		KASSERT(cloner->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		stem = cloner->drv->cdevsw.d_name;
 
 		if (strcmp(name, stem) == 0) {
 			/* Exact driver name: no unit was requested. */
 			unit = -1;
 		} else if (dev_stdclone(scratch, NULL, stem, &unit) != 1) {
 			/* Not "<stem><unit>" either; try the next driver. */
 			continue;
 		}
 
 		flags = cloner->drv->ident_flags;
 		matched = true;
 		break;
 	}
 
 	if (!matched)
 		return (ENXIO);
 
 	if (outunit != NULL)
 		*outunit = unit;
 	if (outflags != NULL)
 		*outflags = flags;
 	return (0);
 }
 
 /*
  * Get driver information from a set of flags specified.  Masks the identifying
  * part of the flags and compares it against all of the available
  * tuntap_drivers. Must be called with correct vnet context.
  */
 static struct tuntap_driver *
 tuntap_driver_from_flags(int tun_flags)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_driver_cloner *drvc;
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) {
 		KASSERT(drvc->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		drv = drvc->drv;
 		if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags)
 			return (drv);
 	}
 
 	return (NULL);
 }
 
 /*
  * if_clone match routine for tun: returns 1 when the name resolves to a
  * driver without the TUN_L2 bit, 0 otherwise.
  */
 static int
 tun_clone_match(struct if_clone *ifc, const char *name)
 {
 	int flags;
 
 	if (tuntap_name2info(name, NULL, &flags) != 0)
 		return (0);
 
 	return ((flags & TUN_L2) == 0 ? 1 : 0);
 }
 
 /*
  * if_clone match routine for tap: returns 1 when the name resolves to a
  * driver with TUN_L2 set and TUN_VMNET clear, 0 otherwise.
  */
 static int
 tap_clone_match(struct if_clone *ifc, const char *name)
 {
 	int flags;
 
 	if (tuntap_name2info(name, NULL, &flags) != 0)
 		return (0);
 
 	return ((flags & (TUN_L2 | TUN_VMNET)) == TUN_L2 ? 1 : 0);
 }
 
 /*
  * if_clone match routine for vmnet: returns 1 when the name resolves to a
  * driver with the TUN_VMNET bit set, 0 otherwise.
  */
 static int
 vmnet_clone_match(struct if_clone *ifc, const char *name)
 {
 	int flags;
 
 	if (tuntap_name2info(name, NULL, &flags) != 0)
 		return (0);
 
 	return ((flags & TUN_VMNET) != 0 ? 1 : 0);
 }
 
 /*
  * if_clone create routine shared by tun, tap and vmnet.
  *
  * The requested name selects the driver and, optionally, the unit.  The
  * unit is reserved in the driver's unit allocator, "name" is rewritten to
  * the canonical "<driver><unit>" form, a cdev is found or created, and the
  * ifnet is attached through tuncreate().  On success *ifpp is set to the
  * new interface.
  *
  * Returns 0 on success; the error from tuntap_name2info() for an
  * unrecognised name; ENXIO when no driver matches; EEXIST when the requested
  * unit is taken; ENOSPC when no unit is left; or the error from
  * tun_create_device().  The reserved unit is released again if device
  * creation fails.
  */
 static int
 tun_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	struct cdev *dev;
 	int err, i, tunflags, unit;
 
 	tunflags = 0;
 	/* The name here tells us exactly what we're creating */
 	err = tuntap_name2info(name, &unit, &tunflags);
 	if (err != 0)
 		return (err);
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		return (ENXIO);
 
 	if (unit != -1) {
 		/* If this unit number is still available that's okay. */
 		if (alloc_unr_specific(drv->unrhdr, unit) == -1)
 			return (EEXIST);
 	} else {
 		unit = alloc_unr(drv->unrhdr);
 		/* alloc_unr reports an exhausted unit space with -1. */
 		if (unit == -1)
 			return (ENOSPC);
 	}
 
 	snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit);
 
 	/* find any existing device, or allocate new unit number */
 	dev = NULL;
 	i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0);
 	/* No preexisting struct cdev *, create one */
 	if (i != 0)
 		i = tun_create_device(drv, unit, NULL, &dev, name);
 	if (i != 0) {
 		/* No interface will own the unit; hand it back. */
 		free_unr(drv->unrhdr, unit);
 		return (i);
 	}
 
 	tuncreate(dev);
 	tp = dev->si_drv1;
 	*ifpp = tp->tun_ifp;
 
 	return (0);
 }
 
 /*
  * dev_clone event handler (registered in tuntapmodevent).  When the name
  * being looked up belongs to one of our drivers and the caller is allowed
  * to clone, this finds or creates the matching cdev and then asks if_clone
  * to create the interface.  Does nothing if another handler has already
  * supplied *dev.
  */
 static void
 tunclone(void *arg, struct ucred *cred, char *name, int namelen,
     struct cdev **dev)
 {
 	char devname[SPECNAMELEN + 1];
 	struct tuntap_driver *drv;
 	int append_unit, i, u, tunflags;
 	bool mayclone;
 
 	if (*dev != NULL)
 		return;
 
 	tunflags = 0;
 	/* Every exit below this point must go through "out" to restore. */
 	CURVNET_SET(CRED_TO_VNET(cred));
 	if (tuntap_name2info(name, &u, &tunflags) != 0)
 		goto out;	/* Not recognized */
 
 	if (u != -1 && u > IF_MAXUNIT)
 		goto out;	/* Unit number too high */
 
 	mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0;
 	if ((tunflags & TUN_L2) != 0) {
 		/* tap/vmnet allow user open with a sysctl */
 		mayclone = (mayclone || tap_allow_uopen) && tapdclone;
 	} else {
 		mayclone = mayclone && tundclone;
 	}
 
 	/*
 	 * If tun cloning is enabled, only the superuser can create an
 	 * interface.
 	 */
 	if (!mayclone)
 		goto out;
 
 	/* A bare driver name gets the chosen unit appended further down. */
 	if (u == -1)
 		append_unit = 1;
 	else
 		append_unit = 0;
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		goto out;
 
 	/* find any existing device, or allocate new unit number */
 	i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0);
 	if (i) {
 		if (append_unit) {
 			namelen = snprintf(devname, sizeof(devname), "%s%d",
 			    name, u);
 			name = devname;
 		}
 
 		i = tun_create_device(drv, u, cred, dev, name);
 	}
 	/* The return value of if_clone_create is not checked here. */
 	if (i == 0)
 		if_clone_create(name, namelen, NULL);
 out:
 	CURVNET_RESTORE();
 }
 
 /*
  * Tears down one tunnel: marks it dying, waits for outstanding busy
  * references, destroys the cdev, detaches and frees the ifnet, and finally
  * frees the softc.  Both callers in this file remove the softc from
  * tunhead before calling.
  */
 static void
 tun_destroy(struct tuntap_softc *tp)
 {
 
 	TUN_LOCK(tp);
 	/* From here on tun_busy() fails with EBUSY. */
 	tp->tun_flags |= TUN_DYING;
 	/*
 	 * NOTE(review): cv_wait_unlock presumably returns with tun_mtx
 	 * released, mirroring the explicit unlock in the else branch --
 	 * confirm against condvar(9).
 	 */
 	if (tp->tun_busy != 0)
 		cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx);
 	else
 		TUN_UNLOCK(tp);
 
 	CURVNET_SET(TUN2IFP(tp)->if_vnet);
 
 	/* destroy_dev will take care of any alias. */
 	destroy_dev(tp->tun_dev);
 	seldrain(&tp->tun_rsel);
 	knlist_clear(&tp->tun_rsel.si_note, 0);
 	knlist_destroy(&tp->tun_rsel.si_note);
 	/* Detach the same way tuncreate() attached. */
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		ether_ifdetach(TUN2IFP(tp));
 	} else {
 		bpfdetach(TUN2IFP(tp));
 		if_detach(TUN2IFP(tp));
 	}
 	/* tunrename() checks if_softc under this same sx lock. */
 	sx_xlock(&tun_ioctl_sx);
 	TUN2IFP(tp)->if_softc = NULL;
 	sx_xunlock(&tun_ioctl_sx);
 	/* Return the unit to the driver's allocator before the ifnet goes. */
 	free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit);
 	if_free(TUN2IFP(tp));
 	mtx_destroy(&tp->tun_mtx);
 	cv_destroy(&tp->tun_cv);
 	free(tp, M_TUN);
 	CURVNET_RESTORE();
 }
 
 /*
  * if_clone destroy routine: unlinks the interface's softc from the global
  * tunnel list and tears it down.  Always returns 0; "flags" is unused.
  */
 static int
 tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags)
 {
 	struct tuntap_softc *sc;
 
 	sc = ifp->if_softc;
 
 	/* Remove it from tunhead before the teardown starts. */
 	mtx_lock(&tunmtx);
 	TAILQ_REMOVE(&tunhead, sc, tun_list);
 	mtx_unlock(&tunmtx);
 
 	tun_destroy(sc);
 	return (0);
 }
 
 /*
  * Per-vnet initialisation: attaches an if_clone cloner for every entry of
  * tuntap_drivers and records the pairing on this vnet's cloner list.
  */
 static void
 vnet_tun_init(const void *unused __unused)
 {
 	struct tuntap_driver_cloner *entry;
 	struct tuntap_driver *drv;
 	size_t idx;
 
 	for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 		drv = &tuntap_drivers[idx];
 
 		/* Callbacks come straight from the driver table. */
 		struct if_clone_addreq req = {
 			.destroy_f = drv->clone_destroy_fn,
 			.create_f = drv->clone_create_fn,
 			.match_f = drv->clone_match_fn,
 		};
 
 		entry = malloc(sizeof(*entry), M_TUN, M_WAITOK | M_ZERO);
 		entry->drv = drv;
 		entry->cloner = ifc_attach_cloner(drv->cdevsw.d_name, &req);
 		SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, entry, link);
 	}
 }
 VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
 		vnet_tun_init, NULL);
 
 /*
  * Per-vnet teardown: detaches and frees every cloner that vnet_tun_init()
  * put on this vnet's list.
  */
 static void
 vnet_tun_uninit(const void *unused __unused)
 {
 	struct tuntap_driver_cloner *entry;
 
 	/* Pop from the head until the list is empty. */
 	while ((entry = SLIST_FIRST(&V_tuntap_driver_cloners)) != NULL) {
 		SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link);
 
 		if_clone_detach(entry->cloner);
 		free(entry, M_TUN);
 	}
 }
 VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_tun_uninit, NULL);
 
 /*
  * Global teardown, run as a SYSUNINIT (see the MOD_UNLOAD note in
  * tuntapmodevent).  Removes the event handlers, destroys every remaining
  * tunnel, and releases the per-driver unit allocators and clone lists.
  */
 static void
 tun_uninit(const void *unused __unused)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	int i;
 
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag);
 	EVENTHANDLER_DEREGISTER(dev_clone, clone_tag);
 
 	mtx_lock(&tunmtx);
 	while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
 		TAILQ_REMOVE(&tunhead, tp, tun_list);
 		/* tunmtx is dropped around tun_destroy and retaken after. */
 		mtx_unlock(&tunmtx);
 		tun_destroy(tp);
 		mtx_lock(&tunmtx);
 	}
 	mtx_unlock(&tunmtx);
 	for (i = 0; i < nitems(tuntap_drivers); ++i) {
 		drv = &tuntap_drivers[i];
 		delete_unrhdr(drv->unrhdr);
 		clone_cleanup(&drv->clones);
 	}
 	/* The unit allocators were created with tunmtx; destroy it last. */
 	mtx_destroy(&tunmtx);
 }
 SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL);
 
 /*
  * Maps an ifnet to its driver by comparing if_dname with each driver's
  * device name.  Returns NULL for a NULL ifp or an interface that is not
  * one of ours.
  */
 static struct tuntap_driver *
 tuntap_driver_from_ifnet(const struct ifnet *ifp)
 {
 	struct tuntap_driver *candidate;
 	int idx;
 
 	if (ifp != NULL) {
 		for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 			candidate = &tuntap_drivers[idx];
 			if (strcmp(ifp->if_dname,
 			    candidate->cdevsw.d_name) == 0)
 				return (candidate);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Module event handler.  MOD_LOAD sets up the global mutex, the per-driver
  * clone lists and unit allocators, and the two event handlers.  MOD_UNLOAD
  * does nothing here because teardown lives in tun_uninit.  Any other event
  * yields EOPNOTSUPP.
  */
 static int
 tuntapmodevent(module_t mod, int type, void *data)
 {
 	struct tuntap_driver *drv;
 	int idx;
 
 	if (type == MOD_UNLOAD) {
 		/* See tun_uninit, so it's done after the vnet_sysuninit() */
 		return (0);
 	}
 	if (type != MOD_LOAD)
 		return (EOPNOTSUPP);
 
 	mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF);
 	for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 		drv = &tuntap_drivers[idx];
 		clone_setup(&drv->clones);
 		drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
 	}
 
 	arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event,
 	    tunrename, 0, 1000);
 	if (arrival_tag == NULL)
 		return (ENOMEM);
 
 	clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
 	if (clone_tag == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /* The real module: its events are handled by tuntapmodevent. */
 static moduledata_t tuntap_mod = {
 	"if_tuntap",
 	tuntapmodevent,
 	0
 };
 
 /* We'll only ever have these two, so no need for a macro. */
 /* Both are declared with no event handler. */
 static moduledata_t tun_mod = { "if_tun", NULL, 0 };
 static moduledata_t tap_mod = { "if_tap", NULL, 0 };
 
 DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_tuntap, 1);
 DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_tun, 1);
 DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_tap, 1);
 
 /*
  * Allocates a softc for the given driver and unit, creates the character
  * device with the softc as si_drv1, and links the softc onto the global
  * tunnel list.
  *
  * "cr" may be NULL; when it is set the device is created with MAKEDEV_REF
  * and the credential is handed to make_dev_s().  On success *dev holds the
  * new cdev and 0 is returned.  On failure the make_dev_s() error is
  * returned and the softc, including its mutex and condvar, is fully
  * released.
  */
 static int
 tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr,
     struct cdev **dev, const char *name)
 {
 	struct make_dev_args args;
 	struct tuntap_softc *tp;
 	int error;
 
 	tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO);
 	mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF);
 	cv_init(&tp->tun_cv, "tun_condvar");
 	tp->tun_flags = drv->ident_flags;
 	tp->tun_drv = drv;
 
 	make_dev_args_init(&args);
 	if (cr != NULL)
 		args.mda_flags = MAKEDEV_REF;
 	args.mda_devsw = &drv->cdevsw;
 	args.mda_cr = cr;
 	args.mda_uid = UID_UUCP;
 	args.mda_gid = GID_DIALER;
 	args.mda_mode = 0600;
 	args.mda_unit = unit;
 	args.mda_si_drv1 = tp;
 	error = make_dev_s(&args, dev, "%s", name);
 	if (error != 0) {
 		/*
 		 * Undo mtx_init/cv_init before the memory goes away; an
 		 * initialised lock must not be freed without destroying it.
 		 */
 		cv_destroy(&tp->tun_cv);
 		mtx_destroy(&tp->tun_mtx);
 		free(tp, M_TUN);
 		return (error);
 	}
 
 	KASSERT((*dev)->si_drv1 != NULL,
 	    ("Failed to set si_drv1 at %s creation", name));
 	tp->tun_dev = *dev;
 	knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx);
 	mtx_lock(&tunmtx);
 	TAILQ_INSERT_TAIL(&tunhead, tp, tun_list);
 	mtx_unlock(&tunmtx);
 	return (0);
 }
 
 static void
 tunstart(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	struct mbuf *m;
 
 	TUNDEBUG(ifp, "starting\n");
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_LOCK(&ifp->if_snd);
 		IFQ_POLL_NOLOCK(&ifp->if_snd, m);
 		if (m == NULL) {
 			IFQ_UNLOCK(&ifp->if_snd);
 			return;
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_RWAIT) {
 		tp->tun_flags &= ~TUN_RWAIT;
 		wakeup(tp);
 	}
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
 		TUN_UNLOCK(tp);
 		pgsigio(&tp->tun_sigio, SIGIO, 0);
 	} else
 		TUN_UNLOCK(tp);
 }
 
 /*
  * tunstart_l2
  *
  * queue packets from higher level ready to put out
  *
  * if_start routine for the L2 drivers.  When the tunnel is not vmnet and
  * not yet TUN_READY, the send queue is drained and each dropped packet is
  * counted as an output error.  Otherwise, if packets are queued, the
  * reader is notified via wakeup, SIGIO, select and kqueue.
  */
 static void
 tunstart_l2(struct ifnet *ifp)
 {
 	struct tuntap_softc	*tp = ifp->if_softc;
 
 	TUNDEBUG(ifp, "starting\n");
 
 	/*
 	 * do not junk pending output if we are in VMnet mode.
 	 * XXX: can this do any harm because of queue overflow?
 	 */
 
 	TUN_LOCK(tp);
 	if (((tp->tun_flags & TUN_VMNET) == 0) &&
 	    ((tp->tun_flags & TUN_READY) != TUN_READY)) {
 		struct mbuf *m;
 
 		/* Unlocked read. */
 		/*
 		 * NOTE(review): TUN_LOCK is held at this point, so the
 		 * "unlocked read" remark above looks stale -- confirm.
 		 */
 		TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags);
 
 		/* Throw away everything queued; nobody can read it. */
 		for (;;) {
 			IF_DEQUEUE(&ifp->if_snd, m);
 			if (m != NULL) {
 				m_freem(m);
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			} else
 				break;
 		}
 		TUN_UNLOCK(tp);
 
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
 		if (tp->tun_flags & TUN_RWAIT) {
 			tp->tun_flags &= ~TUN_RWAIT;
 			wakeup(tp);
 		}
 
 		/* The softc lock is dropped around pgsigio and retaken. */
 		if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) {
 			TUN_UNLOCK(tp);
 			pgsigio(&tp->tun_sigio, SIGIO, 0);
 			TUN_LOCK(tp);
 		}
 
 		selwakeuppri(&tp->tun_rsel, PZERO+1);
 		KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */
 	}
 
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 	TUN_UNLOCK(tp);
 } /* tunstart_l2 */
 
 /* XXX: should return an error code so it can fail. */
 /*
  * Allocates and attaches the ifnet for a cdev whose softc already exists
  * in si_drv1 (see tun_create_device), then marks the softc TUN_INITED.
  * L2 tunnels attach as Ethernet with a generated address; tun attaches as
  * a point-to-point interface with a DLT_NULL bpf tap.  Panics if
  * if_alloc() fails.
  */
 static void
 tuncreate(struct cdev *dev)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	struct ifnet *ifp;
 	struct ether_addr eaddr;
 	int iflags;
 	u_char type;
 
 	tp = dev->si_drv1;
 	KASSERT(tp != NULL,
 	    ("si_drv1 should have been initialized at creation"));
 
 	drv = tp->tun_drv;
 	/* Interface type and flags depend on L2 versus L3 operation. */
 	iflags = IFF_MULTICAST;
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		type = IFT_ETHER;
 		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 	} else {
 		type = IFT_PPP;
 		iflags |= IFF_POINTOPOINT;
 	}
 	ifp = tp->tun_ifp = if_alloc(type);
 	if (ifp == NULL)
 		panic("%s%d: failed to if_alloc() interface.\n",
 		    drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_softc = tp;
 	if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_ioctl = tunifioctl;
 	ifp->if_flags = iflags;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		ifp->if_init = tunifinit;
 		ifp->if_start = tunstart_l2;
 
 		ether_gen_addr(ifp, &eaddr);
 		ether_ifattach(ifp, eaddr.octet);
 	} else {
 		ifp->if_mtu = TUNMTU;
 		ifp->if_start = tunstart;
 		ifp->if_output = tunoutput;
 
 		ifp->if_snd.ifq_drv_maxlen = 0;
 		IFQ_SET_READY(&ifp->if_snd);
 
 		if_attach(ifp);
 		bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	}
 
 	/* tunopen() refuses the device until this flag is set. */
 	TUN_LOCK(tp);
 	tp->tun_flags |= TUN_INITED;
 	TUN_UNLOCK(tp);
 
 	TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
 	    ifp->if_xname, dev2unit(dev));
 }
 
 /*
  * ifnet_arrival_event handler (registered in tuntapmodevent).  Acts only
  * on interfaces flagged IFF_RENAMING that belong to one of our drivers.
  * It replaces any previous alias cdev with one named after the new
  * interface name, or leaves no alias when the interface name equals the
  * device name again.
  */
 static void
 tunrename(void *arg __unused, struct ifnet *ifp)
 {
 	struct tuntap_softc *tp;
 	int error;
 
 	if ((ifp->if_flags & IFF_RENAMING) == 0)
 		return;
 
 	if (tuntap_driver_from_ifnet(ifp) == NULL)
 		return;
 
 	/*
 	 * We need to grab the ioctl sx long enough to make sure the softc is
 	 * still there.  If it is, we can safely try to busy the tun device.
 	 * The busy may fail if the device is currently dying, in which case
 	 * we do nothing.  If it doesn't fail, the busy count stops the device
 	 * from dying until we've created the alias (that will then be
 	 * subsequently destroyed).
 	 */
 	sx_xlock(&tun_ioctl_sx);
 	tp = ifp->if_softc;
 	if (tp == NULL) {
 		sx_xunlock(&tun_ioctl_sx);
 		return;
 	}
 	error = tun_busy(tp);
 	sx_xunlock(&tun_ioctl_sx);
 	if (error != 0)
 		return;
 	/* Drop the alias left over from an earlier rename, if any. */
 	if (tp->tun_alias != NULL) {
 		destroy_dev(tp->tun_alias);
 		tp->tun_alias = NULL;
 	}
 
 	/* Renamed back to the device name: no alias is needed. */
 	if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0)
 		goto out;
 
 	/*
 	 * Failure's ok, aliases are created on a best effort basis.  If a
 	 * tun user/consumer decides to rename the interface to conflict with
 	 * another device (non-ifnet) on the system, we will assume they know
 	 * what they are doing.  make_dev_alias_p won't touch tun_alias on
 	 * failure, so we use it but ignore the return value.
 	 */
 	make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s",
 	    ifp->if_xname);
 out:
 	tun_unbusy(tp);
 }
 
 /*
  * d_open entry point shared by tun, tap and vmnet.
  *
  * Returns the tuntap_name2info() error for an unrecognised device name,
  * ENXIO when the interface has not been initialised yet, and EBUSY when
  * the device is already open or being destroyed.  On success it takes a
  * busy reference (released by tundtor), records the opener's pid, marks
  * the link up, and registers tundtor as the cdevpriv destructor.
  */
 static int
 tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
 {
 	struct ifnet	*ifp;
 	struct tuntap_softc *tp;
 	int error __diagused, tunflags;
 
 	tunflags = 0;
 	CURVNET_SET(TD_TO_VNET(td));
 	error = tuntap_name2info(dev->si_name, NULL, &tunflags);
 	if (error != 0) {
 		CURVNET_RESTORE();
 		return (error);	/* Shouldn't happen */
 	}
 
 	tp = dev->si_drv1;
 	KASSERT(tp != NULL,
 	    ("si_drv1 should have been initialized at creation"));
 
 	TUN_LOCK(tp);
 	if ((tp->tun_flags & TUN_INITED) == 0) {
 		TUN_UNLOCK(tp);
 		CURVNET_RESTORE();
 		return (ENXIO);
 	}
 	/* Only one opener at a time, and never once teardown has begun. */
 	if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
 		TUN_UNLOCK(tp);
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 
 	/* TUN_DYING was ruled out above, so this cannot fail. */
 	error = tun_busy_locked(tp);
 	KASSERT(error == 0, ("Must be able to busy an unopen tunnel"));
 	ifp = TUN2IFP(tp);
 
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		/* Snapshot the interface's link-level address. */
 		bcopy(IF_LLADDR(ifp), tp->tun_ether.octet,
 		    sizeof(tp->tun_ether.octet));
 
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
 		if (tapuponopen)
 			ifp->if_flags |= IFF_UP;
 	}
 
 	tp->tun_pid = td->td_proc->p_pid;
 	tp->tun_flags |= TUN_OPEN;
 
 	if_link_state_change(ifp, LINK_STATE_UP);
 	TUNDEBUG(ifp, "open\n");
 	TUN_UNLOCK(tp);
 
 	/*
 	 * This can fail with either ENOENT or EBUSY.  This is in the middle of
 	 * d_open, so ENOENT should not be possible.  EBUSY is possible, but
 	 * the only cdevpriv dtor being set will be tundtor and the softc being
 	 * passed is constant for a given cdev.  We ignore the possible error
 	 * because of this as either "unlikely" or "not actually a problem."
 	 */
 	(void)devfs_set_cdevpriv(tp, tundtor);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * tundtor - tear down the device - mark i/f down & delete
  * routing info
  *
  * cdevpriv destructor installed by tunopen; "data" is the softc.  It
  * flushes pending output, brings the interface down and purges its
  * addresses (skipped for vmnet, and for L2 tunnels with IFF_LINK0 set),
  * marks the link down, clears the open state and drops the busy reference
  * that tunopen took.
  */
 static void
 tundtor(void *data)
 {
 	struct proc *p;
 	struct tuntap_softc *tp;
 	struct ifnet *ifp;
 	bool l2tun;
 
 	tp = data;
 	p = curproc;
 	ifp = TUN2IFP(tp);
 
 	TUN_LOCK(tp);
 
 	/*
 	 * Realistically, we can't be obstinate here.  This only means that the
 	 * tuntap device was closed out of order, and the last closer wasn't the
 	 * controller.  These are still good to know about, though, as software
 	 * should avoid multiple processes with a tuntap device open and
 	 * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in
 	 * parent).
 	 */
 	if (p->p_pid != tp->tun_pid) {
 		log(LOG_INFO,
 		    "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n",
 		    p->p_pid, p->p_comm, tp->tun_dev->si_name);
 	}
 
 	/*
 	 * junk all pending output
 	 */
 	CURVNET_SET(ifp->if_vnet);
 
 	l2tun = false;
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		l2tun = true;
 		IF_DRAIN(&ifp->if_snd);
 	} else {
 		IFQ_PURGE(&ifp->if_snd);
 	}
 
 	/* For vmnet, we won't do most of the address/route bits */
 	if ((tp->tun_flags & TUN_VMNET) != 0 ||
 	    (l2tun && (ifp->if_flags & IFF_LINK0) != 0))
 		goto out;
 
 	/* The softc lock is dropped around if_down and retaken. */
 	if (ifp->if_flags & IFF_UP) {
 		TUN_UNLOCK(tp);
 		if_down(ifp);
 		TUN_LOCK(tp);
 	}
 
 	/* Delete all addresses and routes which reference this interface. */
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		TUN_UNLOCK(tp);
 		if_purgeaddrs(ifp);
 		TUN_LOCK(tp);
 	}
 
 out:
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	CURVNET_RESTORE();
 
 	funsetown(&tp->tun_sigio);
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	TUNDEBUG (ifp, "closed\n");
 	tp->tun_flags &= ~TUN_OPEN;
 	tp->tun_pid = 0;
 	/* Reset the virtio-net header length to zero. */
 	tun_vnethdr_set(ifp, 0);
 
 	/* Pairs with tun_busy_locked() in tunopen. */
 	tun_unbusy_locked(tp);
 	TUN_UNLOCK(tp);
 }
 
 static void
 tuninit(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 
 	TUNDEBUG(ifp, "tuninit\n");
 
 	TUN_LOCK(tp);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if ((tp->tun_flags & TUN_L2) == 0) {
 		ifp->if_flags |= IFF_UP;
 		getmicrotime(&ifp->if_lastchange);
 		TUN_UNLOCK(tp);
 	} else {
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		TUN_UNLOCK(tp);
 		/* attempt to start output */
 		tunstart_l2(ifp);
 	}
 
 }
 
 /*
  * Used only for l2 tunnel.
  *
  * Takes the softc as an opaque pointer and forwards to tuninit() with
  * the softc's ifnet.
  */
 static void
 tunifinit(void *xtp)
 {
 	struct tuntap_softc *sc = xtp;
 
 	tuninit(sc->tun_ifp);
 }
 
 /*
  * Recompute ifp->if_hwassist from the currently enabled capabilities
  * (ifp->if_capenable).  The caller must hold TUN_LOCK.
  */
 static void
 tun_caps_changed(struct ifnet *ifp)
 {
 	uint64_t assist;
 
 	TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc);
 
 	assist = 0;
 	/* Transmit checksum offload, IPv4 then IPv6. */
 	if ((ifp->if_capenable & IFCAP_TXCSUM) != 0)
 		assist |= CSUM_TCP | CSUM_UDP;
 	if ((ifp->if_capenable & IFCAP_TXCSUM_IPV6) != 0)
 		assist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
 	/* TCP segmentation offload, IPv4 then IPv6. */
 	if ((ifp->if_capenable & IFCAP_TSO4) != 0)
 		assist |= CSUM_IP_TSO;
 	if ((ifp->if_capenable & IFCAP_TSO6) != 0)
 		assist |= CSUM_IP6_TSO;
 
 	ifp->if_hwassist = assist;
 }
 
 /*
  * Set the virtio-net header length (tp->tun_vhdrlen) and bring
  * if_capabilities / if_capenable / if_hwassist in line with it.
  * The caller must hold TUN_LOCK.  A call that does not change the
  * length is a no-op.
  */
 static void
 tun_vnethdr_set(struct ifnet *ifp, int vhdrlen)
 {
 	struct tuntap_softc *sc;
 
 	sc = ifp->if_softc;
 	TUN_LOCK_ASSERT(sc);
 
 	if (vhdrlen == sc->tun_vhdrlen)
 		return;
 
 	/*
 	 * The virtio-net header capabilities are offered only while a
 	 * non-zero header length is configured.
 	 */
 	if (vhdrlen == 0)
 		ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS;
 	else
 		ifp->if_capabilities |= TAP_VNET_HDR_CAPS;
 
 	/* Drop enabled capabilities that are no longer offered. */
 	ifp->if_capenable &= ifp->if_capabilities;
 	tun_caps_changed(ifp);
 	sc->tun_vhdrlen = vhdrlen;
 
 	TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n",
 	    vhdrlen, ifp->if_capabilities);
 }
 
 /*
  * Process an ioctl request.
  *
  * Interface-level ioctl handler.  The whole request is serialized by
  * tun_ioctl_sx.  Returns 0 on success or an errno value; ENXIO is returned
  * when the interface no longer has a softc attached.  Requests not handled
  * here are passed to ether_ioctl() for L2 devices and rejected with EINVAL
  * otherwise.
  */
 static int
 tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct tuntap_softc *tp;
 	struct ifstat *ifs;
 	struct ifmediareq	*ifmr;
 	int		dummy, error = 0;
 	bool		l2tun;
 
 	ifmr = NULL;
 	sx_xlock(&tun_ioctl_sx);
 	tp = ifp->if_softc;
 	if (tp == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	switch(cmd) {
 	case SIOCGIFSTATUS:
 		/* Report the controlling pid, or an empty string if none. */
 		ifs = (struct ifstat *)data;
 		TUN_LOCK(tp);
 		if (tp->tun_pid)
 			snprintf(ifs->ascii, sizeof(ifs->ascii),
 			    "\tOpened by PID %d\n", tp->tun_pid);
 		else
 			ifs->ascii[0] = '\0';
 		TUN_UNLOCK(tp);
 		break;
 	case SIOCSIFADDR:
 		/* L2 defers to the Ethernet layer; L3 just brings the i/f up. */
 		if (l2tun)
 			error = ether_ioctl(ifp, cmd, data);
 		else
 			tuninit(ifp);
 		if (error == 0)
 		    TUNDEBUG(ifp, "address set\n");
 		break;
 	case SIOCSIFMTU:
 		/* No range check is performed here on the requested MTU. */
 		ifp->if_mtu = ifr->ifr_mtu;
 		TUNDEBUG(ifp, "mtu set\n");
 		break;
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted, nothing to do. */
 		break;
 	case SIOCGIFMEDIA:
 		if (!l2tun) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * Report a single Ethernet medium.  'dummy' keeps the
 		 * caller-supplied ifm_count before it is overwritten, so the
 		 * media list is copied out only if the caller asked for at
 		 * least one entry.
 		 */
 		ifmr = (struct ifmediareq *)data;
 		dummy = ifmr->ifm_count;
 		ifmr->ifm_count = 1;
 		ifmr->ifm_status = IFM_AVALID;
 		ifmr->ifm_active = IFM_ETHER;
 		/* The link is reported active while the device is open. */
 		if (tp->tun_flags & TUN_OPEN)
 			ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_current = ifmr->ifm_active;
 		if (dummy >= 1) {
 			int media = IFM_ETHER;
 			error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
 		}
 		break;
 	case SIOCSIFCAP:
 		/* Apply the requested capability set and refresh if_hwassist. */
 		TUN_LOCK(tp);
 		ifp->if_capenable = ifr->ifr_reqcap;
 		tun_caps_changed(ifp);
 		TUN_UNLOCK(tp);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	default:
 		if (l2tun) {
 			error = ether_ioctl(ifp, cmd, data);
 		} else {
 			error = EINVAL;
 		}
 	}
 bad:
 	sx_xunlock(&tun_ioctl_sx);
 	return (error);
 }
 
 /*
  * tunoutput - queue packets from higher level ready to put out.
  *
  * Returns 0 on success or an errno value.  On every error path visible
  * here the mbuf is freed (or was already consumed by a failed M_PREPEND)
  * before returning:
  *	EHOSTDOWN	device not TUN_READY, or interface not IFF_UP
  *	ENOBUFS		a header prepend failed, or if_transmit failed
  *	EAFNOSUPPORT	no TUN_IFHEAD and the family is not AF_INET
  *			(or INET is not compiled in)
  */
 static int
 tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	u_short cached_tun_flags;
 	int error;
 	u_int32_t af;
 
 	TUNDEBUG (ifp, "tunoutput\n");
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m0);
 	if (error) {
 		m_freem(m0);
 		return (error);
 	}
 #endif
 
 	/* Could be unlocked read? */
 	/* Snapshot the flags once; the rest of the function uses the copy. */
 	TUN_LOCK(tp);
 	cached_tun_flags = tp->tun_flags;
 	TUN_UNLOCK(tp);
 	if ((cached_tun_flags & TUN_READY) != TUN_READY) {
 		/* NOTE(review): prints the live flags, not the snapshot. */
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	if ((ifp->if_flags & IFF_UP) != IFF_UP) {
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	/* BPF writes need to be handled specially. */
 	/* For AF_UNSPEC the family is carried in the first bytes of sa_data. */
 	if (dst->sa_family == AF_UNSPEC)
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = RO_GET_FAMILY(ro, dst);
 
 	if (bpf_peers_present(ifp->if_bpf))
 		bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0);
 
 	/* prepend sockaddr? this may abort if the mbuf allocation fails */
 	if (cached_tun_flags & TUN_LMODE) {
 		/* allocate space for sockaddr */
 		M_PREPEND(m0, dst->sa_len, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			/*
 			 * NOTE(review): an input-drop counter is bumped on
 			 * this output path - confirm this is intended.
 			 */
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else {
 			bcopy(dst, m0->m_data, dst->sa_len);
 		}
 	}
 
 	if (cached_tun_flags & TUN_IFHEAD) {
 		/* Prepend the address family */
 		M_PREPEND(m0, 4, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else
 			/* The family is stored in network byte order. */
 			*(u_int32_t *)m0->m_data = htonl(af);
 	} else {
 		/*
 		 * Without the family header only AF_INET can be carried.
 		 * When INET is not compiled in, the block below is
 		 * unconditional and every packet is rejected.
 		 */
 #ifdef INET
 		if (af != AF_INET)
 #endif
 		{
 			m_freem(m0);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 	/* Any if_transmit failure is reported to the caller as ENOBUFS. */
 	error = (ifp->if_transmit)(ifp, m0);
 	if (error)
 		return (ENOBUFS);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	return (0);
 }
 
 /*
  * the cdevsw interface is now pretty minimal.
  *
  * Character-device ioctl handler.  Requests are processed in two stages:
  * first a switch specific to the device flavour (tap/L2 or tun/L3), whose
  * handled cases return directly; anything not handled there falls through
  * to the switch of ioctls common to both flavours.  Returns 0 on success or
  * an errno value; unknown requests yield ENOTTY.
  */
 static	int
 tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
     struct thread *td)
 {
 	struct ifreq ifr, *ifrp;
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet *ifp = TUN2IFP(tp);
 	struct tuninfo *tunp;
 	int error, iflags, ival;
 	bool	l2tun;
 
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	if (l2tun) {
 		/* tap specific ioctls */
 		switch(cmd) {
 		/* VMware/VMnet port ioctl's */
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4)
 		/*
 		 * Compat form: the value arrives via IOCPARM_IVAL(data)
 		 * rather than through a pointer, so redirect 'data' to a
 		 * local copy and share the code below.
 		 */
 		case _IO('V', 0):
 			ival = IOCPARM_IVAL(data);
 			data = (caddr_t)&ival;
 			/* FALLTHROUGH */
 #endif
 		case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */
 			/*
 			 * Keep only the flags in TUN_VMIO_FLAG_MASK that are
 			 * changeable, force IFF_UP, and preserve the
 			 * interface's current IFF_CANTCHANGE bits.
 			 */
 			iflags = *(int *)data;
 			iflags &= TUN_VMIO_FLAG_MASK;
 			iflags &= ~IFF_CANTCHANGE;
 			iflags |= IFF_UP;
 
 			TUN_LOCK(tp);
 			ifp->if_flags = iflags |
 			    (ifp->if_flags & IFF_CANTCHANGE);
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCGIFADDR:	/* get MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(&tp->tun_ether.octet, data,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCSIFADDR:	/* set MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(data, &tp->tun_ether.octet,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TAPSVNETHDR:
 			/*
 			 * Only 0 (disabled) or the size of one of the two
 			 * virtio-net header layouts is accepted.
 			 */
 			ival = *(int *)data;
 			if (ival != 0 &&
 			    ival != sizeof(struct virtio_net_hdr) &&
 			    ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
 				return (EINVAL);
 			}
 			TUN_LOCK(tp);
 			tun_vnethdr_set(ifp, ival);
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TAPGVNETHDR:
 			TUN_LOCK(tp);
 			*(int *)data = tp->tun_vhdrlen;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 
 		/* Fall through to the common ioctls if unhandled */
 	} else {
 		switch (cmd) {
 		case TUNSLMODE:
 			/* TUN_LMODE and TUN_IFHEAD are mutually exclusive. */
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_LMODE;
 				tp->tun_flags &= ~TUN_IFHEAD;
 			} else
 				tp->tun_flags &= ~TUN_LMODE;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFHEAD:
 			/* Mirror image of TUNSLMODE above. */
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_IFHEAD;
 				tp->tun_flags &= ~TUN_LMODE;
 			} else
 				tp->tun_flags &= ~TUN_IFHEAD;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNGIFHEAD:
 			TUN_LOCK(tp);
 			*(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFMODE:
 			/* deny this if UP */
 			if (TUN2IFP(tp)->if_flags & IFF_UP)
 				return (EBUSY);
 
 			/*
 			 * Exactly one of IFF_POINTOPOINT or IFF_BROADCAST
 			 * must be requested, optionally with IFF_MULTICAST.
 			 */
 			switch (*(int *)data & ~IFF_MULTICAST) {
 			case IFF_POINTOPOINT:
 			case IFF_BROADCAST:
 				TUN_LOCK(tp);
 				TUN2IFP(tp)->if_flags &=
 				    ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
 				TUN2IFP(tp)->if_flags |= *(int *)data;
 				TUN_UNLOCK(tp);
 
 				break;
 			default:
 				return (EINVAL);
 			}
 
 			return (0);
 		case TUNSIFPID:
 			/* The calling process becomes the controlling pid. */
 			TUN_LOCK(tp);
 			tp->tun_pid = curthread->td_proc->p_pid;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 		/* Fall through to the common ioctls if unhandled */
 	}
 
 	switch (cmd) {
 	case TUNGIFNAME:
 		ifrp = (struct ifreq *)data;
 		strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ);
 
 		return (0);
 	case TUNSIFINFO:
 		/* The interface type cannot be changed through this call. */
 		tunp = (struct tuninfo *)data;
 		if (TUN2IFP(tp)->if_type != tunp->type)
 			return (EPROTOTYPE);
 		TUN_LOCK(tp);
 		/*
 		 * An MTU change is routed through ifhwioctl(SIOCSIFMTU).
 		 * NOTE(review): that call is made with TUN_LOCK held -
 		 * confirm this is safe for the lock type in use.
 		 */
 		if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
 			strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ);
 			ifr.ifr_mtu = tunp->mtu;
 			CURVNET_SET(TUN2IFP(tp)->if_vnet);
 			error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp),
 			    (caddr_t)&ifr, td);
 			CURVNET_RESTORE();
 			if (error) {
 				TUN_UNLOCK(tp);
 				return (error);
 			}
 		}
 		TUN2IFP(tp)->if_baudrate = tunp->baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNGIFINFO:
 		tunp = (struct tuninfo *)data;
 		TUN_LOCK(tp);
 		tunp->mtu = TUN2IFP(tp)->if_mtu;
 		tunp->type = TUN2IFP(tp)->if_type;
 		tunp->baudrate = TUN2IFP(tp)->if_baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNSDEBUG:
 		/* tundebug is a single variable, not per-device state. */
 		tundebug = *(int *)data;
 		break;
 	case TUNGDEBUG:
 		*(int *)data = tundebug;
 		break;
 	case FIONBIO:
 		/* Accepted, nothing to do here. */
 		break;
 	case FIOASYNC:
 		TUN_LOCK(tp);
 		if (*(int *)data)
 			tp->tun_flags |= TUN_ASYNC;
 		else
 			tp->tun_flags &= ~TUN_ASYNC;
 		TUN_UNLOCK(tp);
 		break;
 	case FIONREAD:
 		/*
 		 * Report the byte count of the first queued packet only
 		 * (sum of m_len along its m_next chain), or 0 if the queue
 		 * is empty.
 		 */
 		if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
 			struct mbuf *mb;
 			IFQ_LOCK(&TUN2IFP(tp)->if_snd);
 			IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
 			for (*(int *)data = 0; mb != NULL; mb = mb->m_next)
 				*(int *)data += mb->m_len;
 			IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
 		} else
 			*(int *)data = 0;
 		break;
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &tp->tun_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&tp->tun_sigio);
 		return (0);
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		/* The id is negated before being handed to fsetown(). */
 		return (fsetown(-(*(int *)data), &tp->tun_sigio));
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)data = -fgetown(&tp->tun_sigio);
 		return (0);
 
 	default:
 		return (ENOTTY);
 	}
 	return (0);
 }
 
 /*
  * The cdevsw read interface - reads a packet at a time, or at
  * least as much of a packet as can be read.
  *
  * Returns 0 on success or an errno value: EHOSTDOWN if the device is not
  * TUN_READY, EWOULDBLOCK if the queue is empty and O_NONBLOCK is set, or
  * the error returned by mtx_sleep()/uiomove().  Whatever part of the packet
  * does not fit in the caller's buffer is discarded.
  */
 static	int
 tunread(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 	struct mbuf	*m;
 	size_t		len;
 	int		error = 0;
 
 	TUNDEBUG (ifp, "read\n");
 	TUN_LOCK(tp);
 	if ((tp->tun_flags & TUN_READY) != TUN_READY) {
 		TUN_UNLOCK(tp);
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		return (EHOSTDOWN);
 	}
 
 	tp->tun_flags &= ~TUN_RWAIT;
 
 	/* Dequeue one packet, sleeping on 'tp' until one is available. */
 	for (;;) {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m != NULL)
 			break;
 		if (flag & O_NONBLOCK) {
 			TUN_UNLOCK(tp);
 			return (EWOULDBLOCK);
 		}
 		/*
 		 * TUN_RWAIT marks a reader as waiting; presumably the
 		 * transmit path checks it before issuing a wakeup - confirm.
 		 */
 		tp->tun_flags |= TUN_RWAIT;
 		error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
 		    "tunread", 0);
 		if (error != 0) {
 			TUN_UNLOCK(tp);
 			return (error);
 		}
 	}
 	TUN_UNLOCK(tp);
 
 	/* NOTE(review): tun_flags and tun_vhdrlen are read unlocked below. */
 	if ((tp->tun_flags & TUN_L2) != 0)
 		BPF_MTAP(ifp, m);
 
 	/* Emit the virtio-net header first, truncated to the buffer size. */
 	len = min(tp->tun_vhdrlen, uio->uio_resid);
 	if (len > 0) {
 		struct virtio_net_hdr_mrg_rxbuf vhdr;
 
 		/* Header defaults to all zeroes unless offloads are pending. */
 		bzero(&vhdr, sizeof(vhdr));
 		if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) {
 			/* The returned mbuf replaces m and is NULL-checked below. */
 			m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr);
 		}
 
 		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
 		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
 		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
 		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
 		    vhdr.hdr.csum_offset);
 		error = uiomove(&vhdr, len, uio);
 	}
 
 	/* Copy the chain out mbuf by mbuf, freeing each one as it is consumed. */
 	while (m && uio->uio_resid > 0 && error == 0) {
 		len = min(uio->uio_resid, m->m_len);
 		if (len != 0)
 			error = uiomove(mtod(m, void *), len, uio);
 		m = m_free(m);
 	}
 
 	/* Anything left over (short buffer or copy error) is dropped. */
 	if (m) {
 		TUNDEBUG(ifp, "Dropping mbuf\n");
 		m_freem(m);
 	}
 	return (error);
 }
 
 static int
 tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m,
 	    struct virtio_net_hdr_mrg_rxbuf *vhdr)
 {
 	struct epoch_tracker et;
 	struct ether_header *eh;
 	struct ifnet *ifp;
 
 	ifp = TUN2IFP(tp);
 
 	/*
 	 * Only pass a unicast frame to ether_input(), if it would
 	 * actually have been received by non-virtual hardware.
 	 */
 	if (m->m_len < sizeof(struct ether_header)) {
 		m_freem(m);
 		return (0);
 	}
 
 	eh = mtod(m, struct ether_header *);
 
 	if (eh && (ifp->if_flags & IFF_PROMISC) == 0 &&
 	    !ETHER_IS_MULTICAST(eh->ether_dhost) &&
 	    bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) {
 		m_freem(m);
 		return (0);
 	}
 
 	if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) {
 		m_freem(m);
 		return (0);
 	}
 
 	/* Pass packet up to parent. */
 	CURVNET_SET(ifp->if_vnet);
 	NET_EPOCH_ENTER(et);
 	(*ifp->if_input)(ifp, m);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	/* ibytes are counted in parent */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	return (0);
 }
 
 static int
 tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 	int family, isr;
 
 	ifp = TUN2IFP(tp);
 	/* Could be unlocked read? */
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_IFHEAD) {
 		TUN_UNLOCK(tp);
 		if (m->m_len < sizeof(family) &&
 		(m = m_pullup(m, sizeof(family))) == NULL)
 			return (ENOBUFS);
 		family = ntohl(*mtod(m, u_int32_t *));
 		m_adj(m, sizeof(family));
 	} else {
 		TUN_UNLOCK(tp);
 		family = AF_INET;
 	}
 
 	BPF_MTAP2(ifp, &family, sizeof(family), m);
 
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 	random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	CURVNET_SET(ifp->if_vnet);
 	M_SETFIB(m, ifp->if_fib);
 	NET_EPOCH_ENTER(et);
 	netisr_dispatch(isr, m);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * the cdevsw write interface - an atomic write is a packet - or else!
  *
  * Each write() supplies exactly one packet, optionally preceded by a
  * virtio-net header of tp->tun_vhdrlen bytes.  Returns 0 on success (also
  * when the interface is down or the write is empty, in which case the data
  * is silently ignored), EIO if the write exceeds the maximum receive unit,
  * ENOBUFS if no mbuf could be allocated, or the error from uiomove() or the
  * L2/L3 injection helper.
  */
 static	int
 tunwrite(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct virtio_net_hdr_mrg_rxbuf vhdr;
 	struct tuntap_softc *tp;
 	struct ifnet	*ifp;
 	struct mbuf	*m;
 	uint32_t	mru;
 	int		align, vhdrlen, error;
 	bool		l2tun;
 
 	tp = dev->si_drv1;
 	ifp = TUN2IFP(tp);
 	TUNDEBUG(ifp, "tunwrite\n");
 	if ((ifp->if_flags & IFF_UP) != IFF_UP)
 		/* ignore silently */
 		return (0);
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/*
 	 * Compute the largest acceptable write: the base MRU plus whatever
 	 * framing precedes the payload (vnet header for L2, family word for
 	 * L3 with TUN_IFHEAD).
 	 */
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	mru = l2tun ? TAPMRU : TUNMRU;
 	vhdrlen = tp->tun_vhdrlen;
 	align = 0;
 	if (l2tun) {
 		align = ETHER_ALIGN;
 		mru += vhdrlen;
 	} else if ((tp->tun_flags & TUN_IFHEAD) != 0)
 		mru += sizeof(uint32_t);	/* family */
 	if (uio->uio_resid < 0 || uio->uio_resid > mru) {
 		TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
 		return (EIO);
 	}
 
 	if (vhdrlen > 0) {
 		/*
 		 * vhdr lives on the stack.  A write shorter than vhdrlen
 		 * would leave its tail unset by uiomove(), so zero it first:
 		 * the debug print below and any later consumer then never
 		 * see uninitialized stack contents.
 		 */
 		bzero(&vhdr, sizeof(vhdr));
 		error = uiomove(&vhdr, vhdrlen, uio);
 		if (error != 0)
 			return (error);
 		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
 		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
 		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
 		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
 		    vhdr.hdr.csum_offset);
 	}
 
 	/* Copy the remaining user data into a packet-header mbuf chain. */
 	if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/* Hand off to the flavour-specific injector, which consumes m. */
 	if (l2tun)
 		return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL));
 
 	return (tunwrite_l3(tp, m));
 }
 
 /*
  * tunpoll - poll(2)/select(2) support.
  *
  * Only readability carries information: the device is readable when the
  * interface send queue is non-empty, otherwise the caller is recorded on
  * tun_rsel.  Writability is always reported, since a write never blocks -
  * the packet is either accepted or dropped.
  */
 static	int
 tunpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct tuntap_softc *sc = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(sc);
 	int		rd_events, revents;
 
 	TUNDEBUG(ifp, "tunpoll\n");
 
 	revents = 0;
 	rd_events = events & (POLLIN | POLLRDNORM);
 	if (rd_events != 0) {
 		IFQ_LOCK(&ifp->if_snd);
 		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
 			TUNDEBUG(ifp, "tunpoll waiting\n");
 			selrecord(td, &sc->tun_rsel);
 		} else {
 			TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len);
 			revents |= rd_events;
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 
 	return (revents | (events & (POLLOUT | POLLWRNORM)));
 }
 
 /*
  * tunkqfilter - attach a knote for the kevent() system call.
  *
  * EVFILT_READ and EVFILT_WRITE are supported; any other filter yields
  * EINVAL.  Both filter kinds are hooked to the softc and added to the
  * tun_rsel knote list.
  */
 static int
 tunkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct tuntap_softc	*sc = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(sc);
 
 	if (kn->kn_filter == EVFILT_READ) {
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_read_filterops;
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_write_filterops;
 	} else {
 		TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		return(EINVAL);
 	}
 
 	kn->kn_hook = sc;
 	knlist_add(&sc->tun_rsel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Return true if there is data in the interface queue.
  *
  * kn->kn_data is set to the current length of the send queue.
  */
 static int
 tunkqread(struct knote *kn, long hint)
 {
 	struct tuntap_softc	*sc = kn->kn_hook;
 	struct ifnet	*ifp = TUN2IFP(sc);
 	struct cdev		*cdev = sc->tun_dev;
 
 	kn->kn_data = ifp->if_snd.ifq_len;
 	if (kn->kn_data <= 0) {
 		TUNDEBUG(ifp,
 		    "%s waiting for data, minor = %#x\n", ifp->if_xname,
 		    dev2unit(cdev));
 		return (0);
 	}
 
 	TUNDEBUG(ifp,
 	    "%s have data in the queue.  Len = %d, minor = %#x\n",
 	    ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(cdev));
 	return (1);
 }
 
 /*
  * Always can write, always return MTU in kn->data.
  */
 static int
 tunkqwrite(struct knote *kn, long hint)
 {
 	struct tuntap_softc	*tp = kn->kn_hook;
 	struct ifnet	*ifp = TUN2IFP(tp);
 
 	kn->kn_data = ifp->if_mtu;
 
 	return (1);
 }
 
 /*
  * Detach a knote: remove it from the tun_rsel knote list of the softc it
  * was hooked to.
  */
 static void
 tunkqdetach(struct knote *kn)
 {
 	struct tuntap_softc	*sc;
 
 	sc = kn->kn_hook;
 	knlist_remove(&sc->tun_rsel.si_note, kn, 0);
 }
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index afb58578e96b..831c609b3e2c 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -1,839 +1,681 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef	_NET_IF_VAR_H_
 #define	_NET_IF_VAR_H_
 
 /*
  * Structures defining a network interface, providing a packet
  * transport mechanism (ala level 0 of the PUP protocols).
  *
  * Each interface accepts output datagrams of a specified maximum
  * length, and provides higher level routines with input datagrams
  * received from its medium.
  *
  * Output occurs when the routine if_output is called, with three parameters:
  *	(*ifp->if_output)(ifp, m, dst, ro)
  * Here m is the mbuf chain to be sent and dst is the destination address.
  * The output routine encapsulates the supplied datagram if necessary,
  * and then transmits it on its medium.
  *
  * On input, each interface unwraps the data received by it, and either
  * places it on the input queue of an internetwork datagram routine
  * and posts the associated software interrupt, or passes the datagram to a raw
  * packet input routine.
  *
  * Routines exist for locating interfaces by their addresses
  * or for locating an interface on a certain network, as well as more general
  * routing and gateway routines maintaining information used to locate
  * interfaces.  These routines live in the files if.c and route.c
  */
 
 struct	rtentry;		/* ifa_rtrequest */
 struct	socket;
 struct	carp_if;
 struct	carp_softc;
 struct  ifvlantrunk;
 struct	route;			/* if_output */
 struct	vnet;
 struct	ifmedia;
 struct	netmap_adapter;
 struct	debugnet_methods;
 
 #ifdef _KERNEL
 #include <sys/_eventhandler.h>
 #include <sys/mbuf.h>		/* ifqueue only? */
 #include <sys/buf_ring.h>
 #include <net/vnet.h>
 #endif /* _KERNEL */
 #include <sys/ck.h>
 #include <sys/counter.h>
 #include <sys/epoch.h>
 #include <sys/lock.h>		/* XXX */
 #include <sys/mutex.h>		/* struct ifqueue */
 #include <sys/rwlock.h>		/* XXX */
 #include <sys/sx.h>		/* XXX */
 #include <sys/_task.h>		/* if_link_task */
 #define	IF_DUNIT_NONE	-1
 
 #include <net/altq/if_altq.h>
 
 /* List head types for interfaces, addresses, multicast addresses, groups. */
 CK_STAILQ_HEAD(ifnethead, ifnet);	/* we use tail queues (CK_STAILQ) so that the order of */
 CK_STAILQ_HEAD(ifaddrhead, ifaddr);	/* instantiation is preserved in the list */
 CK_STAILQ_HEAD(ifmultihead, ifmultiaddr);
 CK_STAILQ_HEAD(ifgrouphead, ifg_group);
 
 #ifdef _KERNEL
 /* Per-vnet pfil head for the link layer, registered under PFIL_ETHER_NAME. */
 VNET_DECLARE(struct pfil_head *, link_pfil_head);
 #define	V_link_pfil_head	VNET(link_pfil_head)
 #define	PFIL_ETHER_NAME		"ethernet"
 
 /*
  * Per-vnet IPsec helper-hook heads, one inbound and one outbound array,
  * each indexed by HHOOK_IPSEC_INET / HHOOK_IPSEC_INET6.
  */
 #define	HHOOK_IPSEC_INET	0
 #define	HHOOK_IPSEC_INET6	1
 #define	HHOOK_IPSEC_COUNT	2
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 #define	V_ipsec_hhh_in	VNET(ipsec_hhh_in)
 #define	V_ipsec_hhh_out	VNET(ipsec_hhh_out)
 #endif /* _KERNEL */
 
 /*
  * Identifiers of the per-interface statistics counters.  IFCOUNTERS is not
  * a counter itself: it is the number of identifiers that precede it.
  */
 typedef enum {
 	IFCOUNTER_IPACKETS = 0,
 	IFCOUNTER_IERRORS,
 	IFCOUNTER_OPACKETS,
 	IFCOUNTER_OERRORS,
 	IFCOUNTER_COLLISIONS,
 	IFCOUNTER_IBYTES,
 	IFCOUNTER_OBYTES,
 	IFCOUNTER_IMCASTS,
 	IFCOUNTER_OMCASTS,
 	IFCOUNTER_IQDROPS,
 	IFCOUNTER_OQDROPS,
 	IFCOUNTER_NOPROTO,
 	IFCOUNTERS /* Array size. */
 } ift_counter;
 
 /*
  * Function pointer types for the per-interface method slots (start, ioctl,
  * init, input, output, queue flush, transmit and counter retrieval).
  */
 typedef	void (*if_start_fn_t)(if_t);
 typedef	int (*if_ioctl_fn_t)(if_t, u_long, caddr_t);
 typedef	void (*if_init_fn_t)(void *);
 typedef	void (*if_input_fn_t)(struct ifnet *, struct mbuf *);
 typedef	int (*if_output_fn_t)
     (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *);
 typedef void (*if_qflush_fn_t)(if_t);
 typedef int (*if_transmit_fn_t)(if_t, struct mbuf *);
 typedef	uint64_t (*if_get_counter_t)(if_t, ift_counter);
 
 /* TSO (TCP segmentation offload) limits of an interface. */
 struct ifnet_hw_tsomax {
 	u_int	tsomaxbytes;	/* TSO total burst length limit in bytes */
 	u_int	tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	tsomaxsegsize;	/* TSO maximum segment size in bytes */
 };
 
 /* Interface encap request types (value of if_encap_req.rtype) */
 typedef enum {
 	IFENCAP_LL = 1			/* pre-calculate link-layer header */
 } ife_type;
 
 /*
  * The structure below is used to request various pre-calculated L2/L3
  * headers for different media. Requests vary by type (rtype field).
  *
  * IFENCAP_LL type: pre-calculates link header based on address family
  *   and destination lladdr.
  *
  *   Input data fields:
  *     buf: pointer to destination buffer
  *     bufsize: buffer size
  *     flags: IFENCAP_FLAG_BROADCAST if destination is broadcast
  *     family: address family defined by AF_ constant.
  *     lladdr: pointer to link-layer address
  *     lladdr_len: length of link-layer address
  *     hdata: pointer to L3 header (optional, used for ARP requests).
  *   Output data fields:
  *     buf: encap data is stored here
  *     bufsize: resulting encap length is stored here
  *     lladdr_off: offset of link-layer address from encap hdr start
  *     hdata: L3 header may be altered if necessary
  *
  * In the field comments below, (r) marks a field read from the request,
  * (w) a field written as a result and (rw) a field that is both.
  */
 
 struct if_encap_req {
 	u_char		*buf;		/* Destination buffer (w) */
 	size_t		bufsize;	/* size of provided buffer (r) */
 	ife_type	rtype;		/* request type (r) */
 	uint32_t	flags;		/* Request flags (r) */
 	int		family;		/* Address family AF_* (r) */
 	int		lladdr_off;	/* offset from header start (w) */
 	int		lladdr_len;	/* lladdr length (r) */
 	char		*lladdr;	/* link-level address pointer (r) */
 	char		*hdata;		/* Upper layer header data (rw) */
 };
 
 #define	IFENCAP_FLAG_BROADCAST	0x02	/* Destination is broadcast */
 
 /*
  * Network interface send tag support. The storage of "struct
  * m_snd_tag" comes from the network driver and it is free to allocate
  * as much additional space as it wants for its own use.
  */
 struct ktls_session;
 struct m_snd_tag;
 
 /*
  * Send tag types (see if_snd_tag_alloc_header.type).
  * IF_SND_TAG_TYPE_MAX is one greater than the highest type value.
  */
 #define	IF_SND_TAG_TYPE_RATE_LIMIT 0
 #define	IF_SND_TAG_TYPE_UNLIMITED 1
 #define	IF_SND_TAG_TYPE_TLS 2
 #define	IF_SND_TAG_TYPE_TLS_RATE_LIMIT 3
 #define	IF_SND_TAG_TYPE_TLS_RX 4
 #define	IF_SND_TAG_TYPE_MAX 5
 
 /*
  * Common header of a send tag allocation request.  It is the first member
  * of every if_snd_tag_alloc_* structure below.
  */
 struct if_snd_tag_alloc_header {
 	uint32_t type;		/* send tag type, see IF_SND_TAG_XXX */
 	uint32_t flowid;	/* mbuf hash value */
 	uint32_t flowtype;	/* mbuf hash type */
 	uint8_t numa_domain;	/* numa domain of associated inp */
 };
 
 /* Allocation parameters for a rate-limited send tag. */
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 	uint32_t reserved;	/* alignment */
 };
 
 /* Allocation parameters for a TLS send tag. */
 struct if_snd_tag_alloc_tls {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 };
 
 /* Allocation parameters for a TLS receive tag. */
 struct if_snd_tag_alloc_tls_rx {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 	uint16_t vlan_id;	/* valid if non-zero */
 };
 
 /* Allocation parameters for a rate-limited TLS send tag. */
 struct if_snd_tag_alloc_tls_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 	uint64_t max_rate;	/* in bytes/s */
 };
 
 /*
  * Rate limit parameters.  This layout is shared by the modify and query
  * parameter unions for the rate_limit, unlimited and tls_rate_limit cases.
  */
 struct if_snd_tag_rate_limit_params {
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 /* Parameters for modifying a TLS receive tag. */
 struct if_snd_tag_modify_tls_rx {
 	/* TCP sequence number of TLS header in host endian format */
 	uint32_t tls_hdr_tcp_sn;
 
 	/*
 	 * TLS record length, including all headers, data and trailers.
 	 * If the tls_rec_length is zero, it means HW encryption resumed.
 	 */
 	uint32_t tls_rec_length;
 
 	/* TLS sequence number in host endian format */
 	uint64_t tls_seq_number;
 };
 
 /*
  * Argument of the send tag allocation method.  Every member begins with an
  * if_snd_tag_alloc_header, so 'hdr' can be read regardless of the variant;
  * presumably hdr.type tells which member is in use - confirm with callers.
  */
 union if_snd_tag_alloc_params {
 	struct if_snd_tag_alloc_header hdr;
 	struct if_snd_tag_alloc_rate_limit rate_limit;
 	struct if_snd_tag_alloc_rate_limit unlimited;
 	struct if_snd_tag_alloc_tls tls;
 	struct if_snd_tag_alloc_tls_rx tls_rx;
 	struct if_snd_tag_alloc_tls_rate_limit tls_rate_limit;
 };
 
 /* Argument of the send tag modify method. */
 union if_snd_tag_modify_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
 	struct if_snd_tag_rate_limit_params tls_rate_limit;
 	struct if_snd_tag_modify_tls_rx tls_rx;
 };
 
 /* Argument of the send tag query method. */
 union if_snd_tag_query_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
 	struct if_snd_tag_rate_limit_params tls_rate_limit;
 };
 
 /*
  * Function types of the send tag methods.  Note these are function types,
  * not pointer types: users declare members as "if_snd_tag_xxx_t *".
  */
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
 typedef struct m_snd_tag *(if_next_send_tag_t)(struct m_snd_tag *);
 
 /* Method table for one kind of send tag, plus the tag type it implements. */
 struct if_snd_tag_sw {
 	if_snd_tag_modify_t *snd_tag_modify;
 	if_snd_tag_query_t *snd_tag_query;
 	if_snd_tag_free_t *snd_tag_free;
 	if_next_send_tag_t *next_snd_tag;
 	u_int	type;			/* One of IF_SND_TAG_TYPE_*. */
 };
 
 /*
  * Query return flags (presumably reported in
  * if_ratelimit_query_results.flags - confirm).  Apart from RT_NOSUPPORT,
  * which is zero, each flag is a distinct bit.
  */
 #define RT_NOSUPPORT	  0x00000000	/* Not supported */
 #define RT_IS_INDIRECT    0x00000001	/*
 					 * Interface like a lagg, select
 					 * the actual interface for
 					 * capabilities.
 					 */
 #define RT_IS_SELECTABLE  0x00000002	/*
 					 * No rate table, you select
 					 * rates and the first
 					 * number_of_rates are created.
 					 */
 #define RT_IS_FIXED_TABLE 0x00000004	/* A fixed table is attached */
 #define RT_IS_UNUSABLE	  0x00000008	/* It is not usable for this */
 #define RT_IS_SETUP_REQ	  0x00000010	/* The interface setup must be called before use */
 
 /* Result of a rate limit capability query, filled in by if_ratelimit_query_t. */
 struct if_ratelimit_query_results {
 	const uint64_t *rate_table;	/* Pointer to table if present */
 	uint32_t flags;			/* Flags indicating results */
 	uint32_t max_flows;		/* Max flows using, 0=unlimited */
 	uint32_t number_of_rates;	/* How many unique rates can be created */
 	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
 };
 
 /* Function types of the rate limit query and setup methods. */
 typedef void (if_ratelimit_query_t)(struct ifnet *,
     struct if_ratelimit_query_results *);
 typedef int (if_ratelimit_setup_t)(struct ifnet *, uint64_t, uint32_t);
-
-/*
- * Structure defining a network interface.
- */
-struct ifnet {
-	/* General book keeping of interface lists. */
-	CK_STAILQ_ENTRY(ifnet) if_link; 	/* all struct ifnets are chained (CK_) */
-	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
-	CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */
-					/* protected by if_addr_lock */
-	u_char	if_alloctype;		/* if_type at time of allocation */
-	uint8_t	if_numa_domain;		/* NUMA domain of device */
-	/* Driver and protocol specific information that remains stable. */
-	void	*if_softc;		/* pointer to driver state */
-	void	*if_llsoftc;		/* link layer softc */
-	void	*if_l2com;		/* pointer to protocol bits */
-	const char *if_dname;		/* driver name */
-	int	if_dunit;		/* unit or IF_DUNIT_NONE */
-	u_short	if_index;		/* numeric abbreviation for this if  */
-	u_short	if_idxgen;		/* ... and its generation count */
-	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
-	char	*if_description;	/* interface description */
-
-	/* Variable fields that are touched by the stack and drivers. */
-	int	if_flags;		/* up/down, broadcast, etc. */
-	int	if_drv_flags;		/* driver-managed status flags */
-	int	if_capabilities;	/* interface features & capabilities */
-	int	if_capabilities2;	/* part 2 */
-	int	if_capenable;		/* enabled features & capabilities */
-	int	if_capenable2;		/* part 2 */
-	void	*if_linkmib;		/* link-type-specific MIB data */
-	size_t	if_linkmiblen;		/* length of above data */
-	u_int	if_refcount;		/* reference count */
-
-	/* These fields are shared with struct if_data. */
-	uint8_t		if_type;	/* ethernet, tokenring, etc */
-	uint8_t		if_addrlen;	/* media address length */
-	uint8_t		if_hdrlen;	/* media header length */
-	uint8_t		if_link_state;	/* current link state */
-	uint32_t	if_mtu;		/* maximum transmission unit */
-	uint32_t	if_metric;	/* routing metric (external only) */
-	uint64_t	if_baudrate;	/* linespeed */
-	uint64_t	if_hwassist;	/* HW offload capabilities, see IFCAP */
-	time_t		if_epoch;	/* uptime at attach or stat reset */
-	struct timeval	if_lastchange;	/* time of last administrative change */
-
-	struct  ifaltq if_snd;		/* output queue (includes altq) */
-	struct	task if_linktask;	/* task for link change events */
-	struct	task if_addmultitask;	/* task for SIOCADDMULTI */
-
-	/* Addresses of different protocol families assigned to this if. */
-	struct mtx if_addr_lock;	/* lock to protect address lists */
-		/*
-		 * if_addrhead is the list of all addresses associated to
-		 * an interface.
-		 * Some code in the kernel assumes that first element
-		 * of the list has type AF_LINK, and contains sockaddr_dl
-		 * addresses which store the link-level address and the name
-		 * of the interface.
-		 * However, access to the AF_LINK address through this
-		 * field is deprecated. Use if_addr instead.
-		 */
-	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
-	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
-	int	if_amcount;		/* number of all-multicast requests */
-	struct	ifaddr	*if_addr;	/* pointer to link-level address */
-	void	*if_hw_addr;		/* hardware link-level address */
-	const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
-	struct	mtx if_afdata_lock;
-	void	*if_afdata[AF_MAX];
-	int	if_afdata_initialized;
-
-	/* Additional features hung off the interface. */
-	u_int	if_fib;			/* interface FIB */
-	struct	vnet *if_vnet;		/* pointer to network stack instance */
-	struct	vnet *if_home_vnet;	/* where this ifnet originates from */
-	struct  ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
-	struct	bpf_if *if_bpf;		/* packet filter structure */
-	int	if_pcount;		/* number of promiscuous listeners */
-	void	*if_bridge;		/* bridge glue */
-	void	*if_lagg;		/* lagg glue */
-	void	*if_pf_kif;		/* pf glue */
-	struct	carp_if *if_carp;	/* carp interface structure */
-	struct	label *if_label;	/* interface MAC label */
-	struct	netmap_adapter *if_netmap; /* netmap(4) softc */
-
-	/* Various procedures of the layer2 encapsulation and drivers. */
-	if_output_fn_t if_output;	/* output routine (enqueue) */
-	if_input_fn_t if_input;		/* input routine (from h/w driver) */
-	struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *);
-	int	(*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
-		    struct rtentry *);
-	void (*if_bridge_linkstate)(struct ifnet *ifp);
-	if_start_fn_t	if_start;	/* initiate output routine */
-	if_ioctl_fn_t	if_ioctl;	/* ioctl routine */
-	if_init_fn_t	if_init;	/* Init routine */
-	int	(*if_resolvemulti)	/* validate/resolve multicast */
-		(struct ifnet *, struct sockaddr **, struct sockaddr *);
-	if_qflush_fn_t	if_qflush;	/* flush any queue */
-	if_transmit_fn_t if_transmit;   /* initiate output routine */
-
-	void	(*if_reassign)		/* reassign to vnet routine */
-		(struct ifnet *, struct vnet *, char *);
-	if_get_counter_t if_get_counter; /* get counter values */
-	int	(*if_requestencap)	/* make link header from request */
-		(struct ifnet *, struct if_encap_req *);
-
-	/* Statistics. */
-	counter_u64_t	if_counters[IFCOUNTERS];
-
-	/* Stuff that's only temporary and doesn't belong here. */
-
-	/*
-	 * Network adapter TSO limits:
-	 * ===========================
-	 *
-	 * If the "if_hw_tsomax" field is zero the maximum segment
-	 * length limit does not apply. If the "if_hw_tsomaxsegcount"
-	 * or the "if_hw_tsomaxsegsize" field is zero the TSO segment
-	 * count limit does not apply. If all three fields are zero,
-	 * there is no TSO limit.
-	 *
-	 * NOTE: The TSO limits should reflect the values used in the
-	 * BUSDMA tag a network adapter is using to load a mbuf chain
-	 * for transmission. The TCP/IP network stack will subtract
-	 * space for all linklevel and protocol level headers and
-	 * ensure that the full mbuf chain passed to the network
-	 * adapter fits within the given limits.
-	 */
-	u_int	if_hw_tsomax;		/* TSO maximum size in bytes */
-	u_int	if_hw_tsomaxsegcount;	/* TSO maximum segment count */
-	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
-
-	/*
-	 * Network adapter send tag support:
-	 */
-	if_snd_tag_alloc_t *if_snd_tag_alloc;
-
-	/* Ratelimit (packet pacing) */
-	if_ratelimit_query_t *if_ratelimit_query;
-	if_ratelimit_setup_t *if_ratelimit_setup;
-
-	/* Ethernet PCP */
-	uint8_t if_pcp;
-
-	/*
-	 * Debugnet (Netdump) hooks to be called while in db/panic.
-	 */
-	struct debugnet_methods *if_debugnet_methods;
-	struct epoch_context	if_epoch_ctx;
-
-	/*
-	 * Spare fields to be added before branching a stable branch, so
-	 * that structure can be enhanced without changing the kernel
-	 * binary interface.
-	 */
-	int	if_ispare[4];		/* general use */
-};
-
 #define	IF_NODOM	255
 /*
  * Locks for address lists on the network interface.
  */
 #define	IF_ADDR_LOCK_INIT(if)	mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF)
 #define	IF_ADDR_LOCK_DESTROY(if)	mtx_destroy(&(if)->if_addr_lock)
 
 #define	IF_ADDR_WLOCK(if)	mtx_lock(&(if)->if_addr_lock)
 #define	IF_ADDR_WUNLOCK(if)	mtx_unlock(&(if)->if_addr_lock)
 #define	IF_ADDR_LOCK_ASSERT(if)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock))
 #define	IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED)
 
 #ifdef _KERNEL
 /* interface link layer address change event */
 typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
 /* interface address change event */
 typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
 typedef void (*ifaddr_event_ext_handler_t)(void *, struct ifnet *,
     struct ifaddr *, int);
 EVENTHANDLER_DECLARE(ifaddr_event_ext, ifaddr_event_ext_handler_t);
 #define	IFADDR_EVENT_ADD	0
 #define	IFADDR_EVENT_DEL	1
 /* new interface arrival event */
 typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
 /* interface departure event */
 typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
 /* Interface link state change event */
 typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int);
 EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t);
 /* Interface up/down event */
 #define IFNET_EVENT_UP		0
 #define IFNET_EVENT_DOWN	1
 #define IFNET_EVENT_PCP		2	/* priority code point, PCP */
 #define	IFNET_EVENT_UPDATE_BAUDRATE	3
 
 typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event);
 EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn);
 
 /*
  * interface groups
  */
 struct ifg_group {
 	char				 ifg_group[IFNAMSIZ];
 	u_int				 ifg_refcnt;
 	void				*ifg_pf_kif;
 	CK_STAILQ_HEAD(, ifg_member)	 ifg_members; /* (CK_) */
 	CK_STAILQ_ENTRY(ifg_group)		 ifg_next; /* (CK_) */
 };
 
 struct ifg_member {
 	CK_STAILQ_ENTRY(ifg_member)	 ifgm_next; /* (CK_) */
 	struct ifnet		*ifgm_ifp;
 };
 
 struct ifg_list {
 	struct ifg_group	*ifgl_group;
 	CK_STAILQ_ENTRY(ifg_list)	 ifgl_next; /* (CK_) */
 };
 
 #ifdef _SYS_EVENTHANDLER_H_
 /* group attach event */
 typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
 /* group detach event */
 typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
 /* group change event */
 typedef void (*group_change_event_handler_t)(void *, const char *);
 EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
 #endif /* _SYS_EVENTHANDLER_H_ */
 
 #define	IF_AFDATA_LOCK_INIT(ifp)	\
 	mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF)
 
 #define	IF_AFDATA_WLOCK(ifp)	mtx_lock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_WUNLOCK(ifp)	mtx_unlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_LOCK(ifp)	IF_AFDATA_WLOCK(ifp)
 #define	IF_AFDATA_UNLOCK(ifp)	IF_AFDATA_WUNLOCK(ifp)
 #define	IF_AFDATA_TRYLOCK(ifp)	mtx_trylock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_DESTROY(ifp)	mtx_destroy(&(ifp)->if_afdata_lock)
 
 #define	IF_AFDATA_LOCK_ASSERT(ifp)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock))
 #define	IF_AFDATA_WLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED)
 #define	IF_AFDATA_UNLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED)
 
 /*
  * 72 was chosen below because it is the size of a TCP/IP
  * header (40) + the minimum mss (32).
  */
 #define	IF_MINMTU	72
 #define	IF_MAXMTU	65535
 
 #define	TOEDEV(ifp)	((ifp)->if_llsoftc)
 
 /*
  * The ifaddr structure contains information about one address
  * of an interface.  They are maintained by the different address families,
  * are allocated and attached when an address is set, and are linked
  * together so all addresses for an interface can be located.
  *
  * NOTE: a 'struct ifaddr' is always at the beginning of a larger
  * chunk of malloc'ed memory, where we store the three addresses
  * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
  */
 struct ifaddr {
 	struct	sockaddr *ifa_addr;	/* address of interface */
 	struct	sockaddr *ifa_dstaddr;	/* other end of p-to-p link */
 #define	ifa_broadaddr	ifa_dstaddr	/* broadcast address interface */
 	struct	sockaddr *ifa_netmask;	/* used to determine subnet */
 	struct	ifnet *ifa_ifp;		/* back-pointer to interface */
 	struct	carp_softc *ifa_carp;	/* pointer to CARP data */
 	CK_STAILQ_ENTRY(ifaddr) ifa_link;	/* queue macro glue */
 	u_short	ifa_flags;		/* mostly rt_flags for cloning */
 #define	IFA_ROUTE	RTF_UP		/* route installed */
 #define	IFA_RTSELF	RTF_HOST	/* loopback route to self installed */
 	u_int	ifa_refcnt;		/* references to this structure */
 
 	counter_u64_t	ifa_ipackets;
 	counter_u64_t	ifa_opackets;
 	counter_u64_t	ifa_ibytes;
 	counter_u64_t	ifa_obytes;
 	struct	epoch_context	ifa_epoch_ctx;
 };
 
 struct ifaddr *	ifa_alloc(size_t size, int flags);
 void	ifa_free(struct ifaddr *ifa);
 void	ifa_ref(struct ifaddr *ifa);
 int __result_use_check ifa_try_ref(struct ifaddr *ifa);
 
 /*
  * Multicast address structure.  This is analogous to the ifaddr
  * structure except that it keeps track of multicast addresses.
  */
 #define IFMA_F_ENQUEUED		0x1
 struct ifmultiaddr {
 	CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
 	struct	sockaddr *ifma_addr; 	/* address this membership is for */
 	struct	sockaddr *ifma_lladdr;	/* link-layer translation, if any */
 	struct	ifnet *ifma_ifp;	/* back-pointer to interface */
 	u_int	ifma_refcount;		/* reference count */
 	int	ifma_flags;
 	void	*ifma_protospec;	/* protocol-specific state, if any */
 	struct	ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
 	struct	epoch_context	ifma_epoch_ctx;
 };
 
 extern	struct sx ifnet_sxlock;
 
 #define	IFNET_WLOCK()		sx_xlock(&ifnet_sxlock)
 #define	IFNET_WUNLOCK()		sx_xunlock(&ifnet_sxlock)
 #define	IFNET_RLOCK_ASSERT()	sx_assert(&ifnet_sxlock, SA_SLOCKED)
 #define	IFNET_WLOCK_ASSERT()	sx_assert(&ifnet_sxlock, SA_XLOCKED)
 #define	IFNET_RLOCK()		sx_slock(&ifnet_sxlock)
 #define	IFNET_RUNLOCK()		sx_sunlock(&ifnet_sxlock)
 
 /*
  * Look up an ifnet given its index.  The returned value protected from
  * being freed by the network epoch.  The _ref variant also acquires a
  * reference that must be freed using if_rele().
  */
 struct ifnet	*ifnet_byindex(u_int);
 struct ifnet	*ifnet_byindex_ref(u_int);
 
 /*
  * ifnet_byindexgen() looks up ifnet by index and generation count,
  * attempting to restore a weak pointer that had been stored across
  * the epoch.
  */
 struct ifnet   *ifnet_byindexgen(uint16_t idx, uint16_t gen);
 
 VNET_DECLARE(struct ifnethead, ifnet);
 VNET_DECLARE(struct ifgrouphead, ifg_head);
 VNET_DECLARE(struct ifnet *, loif);	/* first loopback interface */
 
 #define	V_ifnet		VNET(ifnet)
 #define	V_ifg_head	VNET(ifg_head)
 #define	V_loif		VNET(loif)
 
 #ifdef MCAST_VERBOSE
 #define MCDPRINTF printf
 #else
 #define MCDPRINTF(...)
 #endif
 
 int	if_addgroup(struct ifnet *, const char *);
 int	if_delgroup(struct ifnet *, const char *);
 int	if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
 int	if_allmulti(struct ifnet *, int);
 struct	ifnet* if_alloc(u_char);
 struct	ifnet* if_alloc_dev(u_char, device_t dev);
 void	if_attach(struct ifnet *);
 void	if_dead(struct ifnet *);
 int	if_delmulti(struct ifnet *, struct sockaddr *);
 void	if_delmulti_ifma(struct ifmultiaddr *);
 void	if_delmulti_ifma_flags(struct ifmultiaddr *, int flags);
 void	if_detach(struct ifnet *);
 void	if_purgeaddrs(struct ifnet *);
 void	if_delallmulti(struct ifnet *);
 void	if_down(struct ifnet *);
 struct ifmultiaddr *
 	if_findmulti(struct ifnet *, const struct sockaddr *);
 void	if_freemulti(struct ifmultiaddr *ifma);
 void	if_free(struct ifnet *);
 void	if_initname(struct ifnet *, const char *, int);
 void	if_link_state_change(struct ifnet *, int);
 int	if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
 int	if_log(struct ifnet *, int, const char *, ...) __printflike(3, 4);
 void	if_ref(struct ifnet *);
 void	if_rele(struct ifnet *);
 bool	__result_use_check if_try_ref(struct ifnet *);
 int	if_setlladdr(struct ifnet *, const u_char *, int);
 int	if_tunnel_check_nesting(struct ifnet *, struct mbuf *, uint32_t, int);
 void	if_up(struct ifnet *);
 int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 struct	ifnet *ifunit_ref(const char *);
 
 int	ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *);
 
 struct	ifaddr *ifa_ifwithaddr(const struct sockaddr *);
 int		ifa_ifwithaddr_check(const struct sockaddr *);
 struct	ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int);
 struct	ifaddr *ifa_ifwithroute(int, const struct sockaddr *,
     const struct sockaddr *, u_int);
 struct	ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
 int	ifa_preferred(struct ifaddr *, struct ifaddr *);
 
 int	if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
 
 typedef	void *if_com_alloc_t(u_char type, struct ifnet *ifp);
 typedef	void if_com_free_t(void *com, u_char type);
 void	if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
 void	if_deregister_com_alloc(u_char type);
 void	if_data_copy(struct ifnet *, struct if_data *);
 uint64_t if_get_counter_default(struct ifnet *, ift_counter);
 void	if_inc_counter(struct ifnet *, ift_counter, int64_t);
 
 #define IF_LLADDR(ifp)							\
     LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
 
 uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate);
 uint64_t if_getbaudrate(const if_t ifp);
 int if_setcapabilities(if_t ifp, int capabilities);
 int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit);
 int if_getcapabilities(const if_t ifp);
 int if_togglecapenable(if_t ifp, int togglecap);
 int if_setcapenable(if_t ifp, int capenable);
 int if_setcapenablebit(if_t ifp, int setcap, int clearcap);
 int if_getcapenable(const if_t ifp);
 int if_getdunit(const if_t ifp);
 int if_getindex(const if_t ifp);
 const char *if_getdname(const if_t ifp);
 void if_setdname(if_t ifp, const char *name);
 const char *if_name(if_t ifp);
 int if_setname(if_t ifp, const char *name);
 void if_setdescr(if_t ifp, char *descrbuf);
 char *if_allocdescr(size_t sz, int malloc_flag);
 void if_freedescr(char *descrbuf);
 int if_getalloctype(const if_t ifp);
 int if_setdev(if_t ifp, void *dev);
 int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags);
 int if_getdrvflags(const if_t ifp);
 int if_setdrvflags(if_t ifp, int flags);
 int if_clearhwassist(if_t ifp);
 int if_sethwassistbits(if_t ifp, int toset, int toclear);
 int if_sethwassist(if_t ifp, int hwassist_bit);
 int if_gethwassist(const if_t ifp);
 int if_togglehwassist(if_t ifp, int toggle_bits);
 int if_setsoftc(if_t ifp, void *softc);
 void *if_getsoftc(if_t ifp);
 int if_setflags(if_t ifp, int flags);
 int if_gethwaddr(const if_t ifp, struct ifreq *);
 int if_setmtu(if_t ifp, int mtu);
 int if_getmtu(const if_t ifp);
 int if_getmtu_family(const if_t ifp, int family);
 int if_setflagbits(if_t ifp, int set, int clear);
 int if_getflags(const if_t ifp);
 int if_sendq_empty(if_t ifp);
 int if_setsendqready(if_t ifp);
 int if_setsendqlen(if_t ifp, int tx_desc_count);
 int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax);
 int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount);
 int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize);
 u_int if_gethwtsomax(const if_t ifp);
 u_int if_gethwtsomaxsegcount(const if_t ifp);
 u_int if_gethwtsomaxsegsize(const if_t ifp);
 int if_input(if_t ifp, struct mbuf* sendmp);
 int if_sendq_prepend(if_t ifp, struct mbuf *m);
 struct mbuf *if_dequeue(if_t ifp);
 int if_setifheaderlen(if_t ifp, int len);
 void if_setrcvif(struct mbuf *m, if_t ifp);
 void if_setvtag(struct mbuf *m, u_int16_t tag);
 u_int16_t if_getvtag(struct mbuf *m);
 int if_vlantrunkinuse(if_t ifp);
 caddr_t if_getlladdr(const if_t ifp);
 void *if_gethandle(u_char);
 void if_bpfmtap(if_t ifp, struct mbuf *m);
 void if_etherbpfmtap(if_t ifp, struct mbuf *m);
 void if_vlancap(if_t ifp);
 int if_transmit(if_t ifp, struct mbuf *m);
 int if_init(if_t ifp, void *ctx);
 
 /*
  * Traversing through interface address lists.
  */
 struct sockaddr_dl;
 typedef u_int iflladdr_cb_t(void *, struct sockaddr_dl *, u_int);
 u_int if_foreach_lladdr(if_t, iflladdr_cb_t, void *);
 u_int if_foreach_llmaddr(if_t, iflladdr_cb_t, void *);
 u_int if_lladdr_count(if_t);
 u_int if_llmaddr_count(if_t);
 
 int if_getamcount(const if_t ifp);
 struct ifaddr * if_getifaddr(const if_t ifp);
 typedef u_int if_addr_cb_t(void *, struct ifaddr *, u_int);
 u_int if_foreach_addr_type(if_t ifp, int type, if_addr_cb_t cb, void *cb_arg);
 
 /* Functions */
 void if_setinitfn(if_t ifp, if_init_fn_t);
 void if_setinputfn(if_t ifp, if_input_fn_t);
 void if_setioctlfn(if_t ifp, if_ioctl_fn_t);
 void if_setoutputfn(if_t ifp, int(*)
     (if_t, struct mbuf *, const struct sockaddr *, struct route *));
 void if_setstartfn(if_t ifp, void (*)(if_t));
 void if_settransmitfn(if_t ifp, if_transmit_fn_t);
 void if_setqflushfn(if_t ifp, if_qflush_fn_t);
 void if_setgetcounterfn(if_t ifp, if_get_counter_t);
 void if_setsndtagallocfn(if_t ifp, if_snd_tag_alloc_t);
 
 /* TSO */
 void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *);
 int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *);
 
 /* accessors for struct ifreq */
 void *ifr_data_get_ptr(void *ifrp);
 void *ifr_buffer_get_buffer(void *data);
 size_t ifr_buffer_get_length(void *data);
 
 int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 
 #ifdef DEVICE_POLLING
 enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
 
 typedef	int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count);
 int    ether_poll_register(poll_handler_t *h, if_t ifp);
 int    ether_poll_deregister(if_t ifp);
 #endif /* DEVICE_POLLING */
 
 #endif /* _KERNEL */
 
+#include <net/if_private.h>	/* XXX: temporary until drivers converted. */
 #include <net/ifq.h>	/* XXXAO: temporary unconditional include */
 
 #endif /* !_NET_IF_VAR_H_ */
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index 6a2d1bfb3fd1..d908d97b6e6d 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -1,2357 +1,2358 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  * Copyright 2012 ADARA Networks, Inc.
  * Copyright 2017 Dell EMC Isilon
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to ADARA Networks, Inc.
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs.
  * This is sort of sneaky in the implementation, since
  * we need to pretend to be enough of an Ethernet implementation
  * to make arp work.  The way we do this is by telling everyone
  * that we are an Ethernet, and then catch the packets that
  * ether_output() sends to us via if_transmit(), rewrite them for
  * use by the real outgoing interface, and ask it to send them.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_vlan.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 #define	VLAN_DEF_HWIDTH	4
 #define	VLAN_IFFLAGS	(IFF_BROADCAST | IFF_MULTICAST)
 
 #define	UP_AND_RUNNING(ifp) \
     ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING)
 
 CK_SLIST_HEAD(ifvlanhead, ifvlan);
 
 struct ifvlantrunk {
 	struct	ifnet   *parent;	/* parent interface of this trunk */
 	struct	mtx	lock;
 #ifdef VLAN_ARRAY
 #define	VLAN_ARRAY_SIZE	(EVL_VLID_MASK + 1)
 	struct	ifvlan	*vlans[VLAN_ARRAY_SIZE]; /* static table */
 #else
 	struct	ifvlanhead *hash;	/* dynamic hash-list table */
 	uint16_t	hmask;
 	uint16_t	hwidth;
 #endif
 	int		refcnt;
 };
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 struct vlan_snd_tag {
 	struct m_snd_tag com;
 	struct m_snd_tag *tag;
 };
 
 static inline struct vlan_snd_tag *
 mst_to_vst(struct m_snd_tag *mst)
 {
 
 	return (__containerof(mst, struct vlan_snd_tag, com));
 }
 #endif
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk with
  * the assumption that none will be added/removed during iteration.
  */
 #ifdef VLAN_ARRAY
 #define VLAN_FOREACH(_ifv, _trunk) \
 	size_t _i; \
 	for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]) != NULL)
 #else /* VLAN_ARRAY */
 #define VLAN_FOREACH(_ifv, _trunk) \
 	struct ifvlan *_next; \
 	size_t _i; \
 	for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \
 		CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next)
 #endif /* VLAN_ARRAY */
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk while
  * also modifying the number of vlans on the trunk. The iteration continues
  * until some condition is met or there are no more vlans on the trunk.
  */
 #ifdef VLAN_ARRAY
 /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]))
 #else /* VLAN_ARRAY */
 /*
  * The hash table case is more complicated. We allow for the hash table to be
  * modified (i.e. vlans removed) while we are iterating over it. To allow for
  * this we must restart the iteration every time we "touch" something during
  * the iteration, since removal will resize the hash table and invalidate our
  * current position. If acting on the touched element causes the trunk to be
  * emptied, then iteration also stops.
  */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	bool _touch = false; \
 	for (_i = 0; \
 	    !(_cond) && _i < (1 << (_trunk)->hwidth); \
 	    _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \
 		if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \
 		    (_touch = true))
 #endif /* VLAN_ARRAY */
 
 struct vlan_mc_entry {
 	struct sockaddr_dl		mc_addr;
 	CK_SLIST_ENTRY(vlan_mc_entry)	mc_entries;
 	struct epoch_context		mc_epoch_ctx;
 };
 
 struct ifvlan {
 	struct	ifvlantrunk *ifv_trunk;
 	struct	ifnet *ifv_ifp;
 #define	TRUNK(ifv)	((ifv)->ifv_trunk)
 #define	PARENT(ifv)	(TRUNK(ifv)->parent)
 	void	*ifv_cookie;
 	int	ifv_pflags;	/* special flags we have set on parent */
 	int	ifv_capenable;
 	int	ifv_encaplen;	/* encapsulation length */
 	int	ifv_mtufudge;	/* MTU fudged by this much */
 	int	ifv_mintu;	/* min transmission unit */
 	struct  ether_8021q_tag ifv_qtag;
 #define ifv_proto	ifv_qtag.proto
 #define ifv_vid		ifv_qtag.vid
 #define ifv_pcp		ifv_qtag.pcp
 	struct task lladdr_task;
 	CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
 #ifndef VLAN_ARRAY
 	CK_SLIST_ENTRY(ifvlan) ifv_list;
 #endif
 };
 
 /* Special flags we should propagate to parent. */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } vlan_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 VNET_DECLARE(int, vlan_mtag_pcp);
 #define	V_vlan_mtag_pcp	VNET(vlan_mtag_pcp)
 
 static const char vlanname[] = "vlan";
 static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface");
 
 static eventhandler_tag ifdetach_tag;
 static eventhandler_tag iflladdr_tag;
 static eventhandler_tag ifevent_tag;
 
 /*
  * if_vlan uses two module-level synchronizations primitives to allow concurrent
  * modification of vlan interfaces and (mostly) allow for vlans to be destroyed
  * while they are being used for tx/rx. To accomplish this in a way that has
  * acceptable performance and cooperation with other parts of the network stack
  * there is a non-sleepable epoch(9) and an sx(9).
  *
  * The performance-sensitive paths that warrant using the epoch(9) are
  * vlan_transmit and vlan_input. Both have to check for the vlan interface's
  * existence using if_vlantrunk, and being in the network tx/rx paths the use
  * of an epoch(9) gives a measureable improvement in performance.
  *
  * The reason for having an sx(9) is mostly because there are still areas that
  * must be sleepable and also have safe concurrent access to a vlan interface.
  * Since the sx(9) exists, it is used by default in most paths unless sleeping
  * is not permitted, or if it is not clear whether sleeping is permitted.
  *
  */
 #define _VLAN_SX_ID ifv_sx
 
 static struct sx _VLAN_SX_ID;
 
 #define VLAN_LOCKING_INIT() \
 	sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE)
 
 #define VLAN_LOCKING_DESTROY() \
 	sx_destroy(&_VLAN_SX_ID)
 
 #define	VLAN_SLOCK()			sx_slock(&_VLAN_SX_ID)
 #define	VLAN_SUNLOCK()			sx_sunlock(&_VLAN_SX_ID)
 #define	VLAN_XLOCK()			sx_xlock(&_VLAN_SX_ID)
 #define	VLAN_XUNLOCK()			sx_xunlock(&_VLAN_SX_ID)
 #define	VLAN_SLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_SLOCKED)
 #define	VLAN_XLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_XLOCKED)
 #define	VLAN_SXLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_LOCKED)
 
 /*
  * We also have a per-trunk mutex that should be acquired when changing
  * its state.
  */
 #define	TRUNK_LOCK_INIT(trunk)		mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF)
 #define	TRUNK_LOCK_DESTROY(trunk)	mtx_destroy(&(trunk)->lock)
 #define	TRUNK_WLOCK(trunk)		mtx_lock(&(trunk)->lock)
 #define	TRUNK_WUNLOCK(trunk)		mtx_unlock(&(trunk)->lock)
 #define	TRUNK_WLOCK_ASSERT(trunk)	mtx_assert(&(trunk)->lock, MA_OWNED);
 
 /*
  * The VLAN_ARRAY substitutes the dynamic hash with a static array
  * with 4096 entries. In theory this can give a boost in processing,
  * however in practice it does not. Probably this is because the array
  * is too big to fit into CPU cache.
  */
 /* Hash-table helpers; the VLAN_ARRAY build defines inline variants instead. */
 #ifndef VLAN_ARRAY
 static	void vlan_inithash(struct ifvlantrunk *trunk);
 static	void vlan_freehash(struct ifvlantrunk *trunk);
 static	int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	void vlan_growhash(struct ifvlantrunk *trunk, int howmuch);
 static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk,
 	uint16_t vid);
 #endif
 static	void trunk_destroy(struct ifvlantrunk *trunk);
 
 /* ifnet methods installed on each vlan interface by vlan_clone_create(). */
 static	void vlan_init(void *foo);
 static	void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
 /* Send-tag support, only built with KERN_TLS and/or RATELIMIT. */
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static	int vlan_snd_tag_alloc(struct ifnet *,
     union if_snd_tag_alloc_params *, struct m_snd_tag **);
 static	int vlan_snd_tag_modify(struct m_snd_tag *,
     union if_snd_tag_modify_params *);
 static	int vlan_snd_tag_query(struct m_snd_tag *,
     union if_snd_tag_query_params *);
 static	void vlan_snd_tag_free(struct m_snd_tag *);
 static struct m_snd_tag *vlan_next_snd_tag(struct m_snd_tag *);
 static void vlan_ratelimit_query(struct ifnet *,
     struct if_ratelimit_query_results *);
 #endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
 static	int vlan_setflags(struct ifnet *ifp, int status);
 static	int vlan_setmulti(struct ifnet *ifp);
 static	int vlan_transmit(struct ifnet *ifp, struct mbuf *m);
 /* With ALTQ, transmit goes through the interface send queue first. */
 #ifdef ALTQ
 static void vlan_altq_start(struct ifnet *ifp);
 static	int vlan_altq_transmit(struct ifnet *ifp, struct mbuf *m);
 #endif
 static	int vlan_output(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro);
 /* Attaching a vlan to, and detaching it from, its parent trunk. */
 static	void vlan_unconfig(struct ifnet *ifp);
 static	void vlan_unconfig_locked(struct ifnet *ifp, int departing);
 static	int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag,
 	uint16_t proto);
 static	void vlan_link_state(struct ifnet *ifp);
 static	void vlan_capabilities(struct ifvlan *ifv);
 static	void vlan_trunk_capabilities(struct ifnet *ifp);
 
 /* Interface cloner callbacks (see vlan_addreq) and their name parser. */
 static	struct ifnet *vlan_clone_match_ethervid(const char *, int *);
 static	int vlan_clone_match(struct if_clone *, const char *);
 static	int vlan_clone_create(struct if_clone *, char *, size_t,
     struct ifc_data *, struct ifnet **);
 static	int vlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 
 /* Event handlers registered in vlan_modevent(). */
 static	void vlan_ifdetach(void *arg, struct ifnet *ifp);
 static  void vlan_iflladdr(void *arg, struct ifnet *ifp);
 static  void vlan_ifevent(void *arg, struct ifnet *ifp, int event);
 
 /* Task handler that applies a deferred link-layer address change. */
 static  void vlan_lladdr_fn(void *arg, int pending);
 
 /* Cloner handle returned by ifc_attach_cloner(). */
 static struct if_clone *vlan_cloner;
 
 #ifdef VIMAGE
 /* Per-vnet copy of the cloner handle; stored by vnet_vlan_init(). */
 VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner);
 #define	V_vlan_cloner	VNET(vlan_cloner)
 #endif
 
 /*
  * Send-tag method tables.  Every table routes to the same vlan_snd_tag_*
  * and vlan_next_snd_tag functions; the tables differ only in the tag type
  * they advertise.
  */
 #ifdef RATELIMIT
 /* Tags of type IF_SND_TAG_TYPE_UNLIMITED. */
 static const struct if_snd_tag_sw vlan_snd_tag_ul_sw = {
 	.snd_tag_modify = vlan_snd_tag_modify,
 	.snd_tag_query = vlan_snd_tag_query,
 	.snd_tag_free = vlan_snd_tag_free,
 	.next_snd_tag = vlan_next_snd_tag,
 	.type = IF_SND_TAG_TYPE_UNLIMITED
 };
 
 /* Tags of type IF_SND_TAG_TYPE_RATE_LIMIT. */
 static const struct if_snd_tag_sw vlan_snd_tag_rl_sw = {
 	.snd_tag_modify = vlan_snd_tag_modify,
 	.snd_tag_query = vlan_snd_tag_query,
 	.snd_tag_free = vlan_snd_tag_free,
 	.next_snd_tag = vlan_next_snd_tag,
 	.type = IF_SND_TAG_TYPE_RATE_LIMIT
 };
 #endif
 
 #ifdef KERN_TLS
 /* Tags of type IF_SND_TAG_TYPE_TLS. */
 static const struct if_snd_tag_sw vlan_snd_tag_tls_sw = {
 	.snd_tag_modify = vlan_snd_tag_modify,
 	.snd_tag_query = vlan_snd_tag_query,
 	.snd_tag_free = vlan_snd_tag_free,
 	.next_snd_tag = vlan_next_snd_tag,
 	.type = IF_SND_TAG_TYPE_TLS
 };
 
 #ifdef RATELIMIT
 /* Tags of type IF_SND_TAG_TYPE_TLS_RATE_LIMIT (needs both options). */
 static const struct if_snd_tag_sw vlan_snd_tag_tls_rl_sw = {
 	.snd_tag_modify = vlan_snd_tag_modify,
 	.snd_tag_query = vlan_snd_tag_query,
 	.snd_tag_free = vlan_snd_tag_free,
 	.next_snd_tag = vlan_next_snd_tag,
 	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
 };
 #endif
 #endif
 
 /*
  * Deferred destructor for a multicast list entry.  It is scheduled with
  * NET_EPOCH_CALL() on the entry's embedded mc_epoch_ctx, so the enclosing
  * vlan_mc_entry is recovered from ctx before being freed.
  */
 static void
 vlan_mc_free(struct epoch_context *ctx)
 {
 	struct vlan_mc_entry *entry;
 
 	entry = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx);
 	free(entry, M_VLAN);
 }
 
 #ifndef VLAN_ARRAY
 /*
  * Map VLAN ID n to a bucket index: XOR n with itself shifted right by 8
  * and by 4 bits, then mask the result with m (callers pass trunk->hmask
  * or the new table's size minus one).
  */
 #define HASH(n, m)	((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m))
 
 static void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 	int i, n;
 
 	/*
 	 * The trunk must not be locked here since we call malloc(M_WAITOK).
 	 * It is OK in case this function is called before the trunk struct
 	 * gets hooked up and becomes visible from other threads.
 	 */
 
 	KASSERT(trunk->hwidth == 0 && trunk->hash == NULL,
 	    ("%s: hash already initialized", __func__));
 
 	trunk->hwidth = VLAN_DEF_HWIDTH;
 	n = 1 << trunk->hwidth;
 	trunk->hmask = n - 1;
 	trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK);
 	for (i = 0; i < n; i++)
 		CK_SLIST_INIT(&trunk->hash[i]);
 }
 
 /*
  * Release the trunk's hash table and reset its bookkeeping.  With
  * INVARIANTS the table is checked to be initialized and fully empty first.
  */
 static void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 #ifdef INVARIANTS
 	int bucket, nbuckets;
 
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 	nbuckets = 1 << trunk->hwidth;
 	for (bucket = 0; bucket < nbuckets; bucket++)
 		KASSERT(CK_SLIST_EMPTY(&trunk->hash[bucket]),
 		    ("%s: hash table not empty", __func__));
 #endif
 	free(trunk->hash, M_VLAN);
 	trunk->hash = NULL;
 	trunk->hmask = 0;
 	trunk->hwidth = 0;
 }
 
 static int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv->ifv_vid == ifv2->ifv_vid)
 			return (EEXIST);
 
 	/*
 	 * Grow the hash when the number of vlans exceeds half of the number of
 	 * hash buckets squared. This will make the average linked-list length
 	 * buckets/2.
 	 */
 	if (trunk->refcnt > (b * b) / 2) {
 		vlan_growhash(trunk, 1);
 		i = HASH(ifv->ifv_vid, trunk->hmask);
 	}
 	CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list);
 	trunk->refcnt++;
 
 	return (0);
 }
 
 static int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << (trunk->hwidth - 1);
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv2 == ifv) {
 			trunk->refcnt--;
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list);
 			if (trunk->refcnt < (b * b) / 2)
 				vlan_growhash(trunk, -1);
 			return (0);
 		}
 
 	panic("%s: vlan not found\n", __func__);
 	return (ENOENT); /*NOTREACHED*/
 }
 
 /*
  * Grow the hash larger or smaller if memory permits.
  */
 static void
 vlan_growhash(struct ifvlantrunk *trunk, int howmuch)
 {
 	struct ifvlan *ifv;
 	struct ifvlanhead *hash2;
 	int hwidth2, i, j, n, n2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	if (howmuch == 0) {
 		/* Harmless yet obvious coding error */
 		printf("%s: howmuch is 0\n", __func__);
 		return;
 	}
 
 	hwidth2 = trunk->hwidth + howmuch;
 	n = 1 << trunk->hwidth;
 	n2 = 1 << hwidth2;
 	/* Do not shrink the table below the default */
 	if (hwidth2 < VLAN_DEF_HWIDTH)
 		return;
 
 	hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK);
 	if (hash2 == NULL) {
 		printf("%s: out of memory -- hash size not changed\n",
 		    __func__);
 		return;		/* We can live with the old hash table */
 	}
 	for (j = 0; j < n2; j++)
 		CK_SLIST_INIT(&hash2[j]);
 	for (i = 0; i < n; i++)
 		while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) {
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list);
 			j = HASH(ifv->ifv_vid, n2 - 1);
 			CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list);
 		}
 	NET_EPOCH_WAIT();
 	free(trunk->hash, M_VLAN);
 	trunk->hash = hash2;
 	trunk->hwidth = hwidth2;
 	trunk->hmask = n2 - 1;
 
 	if (bootverbose)
 		if_printf(trunk->parent,
 		    "VLAN hash table resized from %d to %d buckets\n", n, n2);
 }
 
 /*
  * Look up the vlan with the given VID on a trunk.  Returns NULL when no
  * such vlan is configured.  Must be called inside the network epoch.
  */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlanhead *bucket;
 	struct ifvlan *cur;
 
 	NET_EPOCH_ASSERT();
 
 	bucket = &trunk->hash[HASH(vid, trunk->hmask)];
 	CK_SLIST_FOREACH(cur, bucket, ifv_list) {
 		if (cur->ifv_vid == vid)
 			return (cur);
 	}
 	return (NULL);
 }
 
 #if 0
 /*
  * Debugging code to view the hashtables.  Compiled out; prints one line
  * per bucket with the names of the vlan interfaces chained in it.
  */
 static void
 vlan_dumphash(struct ifvlantrunk *trunk)
 {
 	int i;
 	struct ifvlan *ifv;
 
 	for (i = 0; i < (1 << trunk->hwidth); i++) {
 		printf("%d: ", i);
 		CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
 			printf("%s ", ifv->ifv_ifp->if_xname);
 		printf("\n");
 	}
 }
 #endif /* 0 */
 #else
 
 /*
  * VLAN_ARRAY variant: the trunk's vlans[] array is indexed directly by
  * VID, so a lookup is a single load.
  */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlan *ifv;
 
 	ifv = trunk->vlans[vid];
 	return (ifv);
 }
 
 /*
  * VLAN_ARRAY variant: claim the slot for ifv's VID.  Returns EEXIST if
  * the slot is already occupied, 0 on success.
  */
 static __inline int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	struct ifvlan **slot;
 
 	slot = &trunk->vlans[ifv->ifv_vid];
 	if (*slot != NULL)
 		return (EEXIST);
 
 	*slot = ifv;
 	trunk->refcnt++;
 	return (0);
 }
 
 /*
  * VLAN_ARRAY variant: clear the slot for ifv's VID and drop the trunk's
  * vlan count.  Always returns 0.
  */
 static __inline int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 
 	trunk->refcnt--;
 	trunk->vlans[ifv->ifv_vid] = NULL;
 	return (0);
 }
 
 /* VLAN_ARRAY variant: no hash table exists, so there is nothing to free. */
 static __inline void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 }
 
 /* VLAN_ARRAY variant: no hash table exists, so there is nothing to set up. */
 static __inline void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 }
 
 #endif /* !VLAN_ARRAY */
 
 /*
  * Tear down a trunk.  The caller must hold the exclusive VLAN lock.
  * The steps are order-dependent: the hash is released first, the parent
  * ifnet stops pointing at the trunk, the trunk lock is destroyed, the
  * reference on the parent (taken with if_ref() in vlan_config()) is
  * dropped, and only then is the trunk itself freed.
  */
 static void
 trunk_destroy(struct ifvlantrunk *trunk)
 {
 	VLAN_XLOCK_ASSERT();
 
 	vlan_freehash(trunk);
 	trunk->parent->if_vlantrunk = NULL;
 	TRUNK_LOCK_DESTROY(trunk);
 	if_rele(trunk->parent);
 	free(trunk, M_VLAN);
 }
 
 /*
  * Program our multicast filter. What we're actually doing is
  * programming the multicast filter of the parent. This has the
  * side effect of causing the parent interface to receive multicast
  * traffic that it doesn't really want, which ends up being discarded
  * later by the upper protocol layers. Unfortunately, there's no way
  * to avoid this: there really is only one physical interface.
  */
 static int
 vlan_setmulti(struct ifnet *ifp)
 {
 	struct ifnet		*ifp_p;
 	struct ifmultiaddr	*ifma;
 	struct ifvlan		*sc;
 	struct vlan_mc_entry	*mc;
 	int			error;
 
 	VLAN_XLOCK_ASSERT();
 
 	/* Find the parent. */
 	sc = ifp->if_softc;
 	ifp_p = PARENT(sc);
 
 	CURVNET_SET_QUIET(ifp_p->if_vnet);
 
 	/* First, remove any existing filter entries. */
 	while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) {
 		CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
 		(void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr);
 		NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 	}
 
 	/* Now program new ones. */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
 		if (mc == NULL) {
 			IF_ADDR_WUNLOCK(ifp);
 			CURVNET_RESTORE();
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
 		mc->mc_addr.sdl_index = ifp_p->if_index;
 		CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 	CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) {
 		error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr,
 		    NULL);
 		if (error) {
 			CURVNET_RESTORE();
 			return (error);
 		}
 	}
 
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * ifnet_event handler.  Only IFNET_EVENT_UPDATE_BAUDRATE is acted on:
  * the parent's new baudrate is copied to every vlan on its trunk.
  */
 static void
 vlan_ifevent(void *arg __unused, struct ifnet *ifp, int event)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	if (event != IFNET_EVENT_UPDATE_BAUDRATE)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(ifv, trunk) {
 			ifv->ifv_ifp->if_baudrate = ifp->if_baudrate;
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * iflladdr_event handler.  When the link-layer address of a parent
  * interface changes, propagate the new address to every vlan configured
  * on top of it.
  */
 static void
 vlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct ifnet *vifp;
 	struct sockaddr_dl *sdl;
 
 	/* Need the epoch since this is run on taskqueue_swi. */
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		/*
 		 * It is a trunk: walk all vlans on it.  The exclusive trunk
 		 * lock keeps concurrent SIOCSIFLLADDR ioctls on the parent
 		 * from garbling the lladdr of a child vlan.
 		 */
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(ifv, trunk) {
 			/*
 			 * Copy the new lladdr into the vlan's ifnet and
 			 * enqueue a task to actually call if_setlladdr.
 			 * That call has to be deferred to a taskqueue
 			 * because it goes through the if_vlan ioctl path
 			 * and tries to acquire the global lock.
 			 */
 			vifp = ifv->ifv_ifp;
 			bcopy(IF_LLADDR(ifp), IF_LLADDR(vifp),
 			    ifp->if_addrlen);
 			sdl = (struct sockaddr_dl *)vifp->if_addr->ifa_addr;
 			sdl->sdl_alen = ifp->if_addrlen;
 			taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * A handler for network interface departure events.
  * Track departure of trunks here so that we don't access invalid
  * pointers or whatever if a trunk is ripped from under us, e.g.,
  * by ejecting its hot-plug card.  However, if an ifnet is simply
  * being renamed, then there's no need to tear down the state.
  *
  * Takes the exclusive VLAN lock for the duration of the teardown and
  * unconfigures each vlan with departing set to 1.
  */
 static void
 vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	VLAN_XLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		/* Not a vlan parent: nothing to tear down. */
 		VLAN_XUNLOCK();
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and detach all vlan's on it.
 	 * Check trunk pointer after each vlan_unconfig() as it will
 	 * free it and set to NULL after the last vlan was detached.
 	 */
 	VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk,
 	    ifp->if_vlantrunk == NULL)
 		vlan_unconfig_locked(ifv->ifv_ifp, 1);
 
 	/* Trunk should have been destroyed in vlan_unconfig(). */
 	KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
 	VLAN_XUNLOCK();
 }
 
 /*
  * Return the parent (trunk) interface of a vlan interface, or NULL when
  * ifp is not a vlan or is not attached to a trunk.  Must be called
  * inside the network epoch.
  */
 static struct ifnet  *
 vlan_trunkdev(struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (NULL);
 
 	ifv = ifp->if_softc;
 	if (ifv->ifv_trunk)
 		return (PARENT(ifv));
 	return (NULL);
 }
 
 /*
  * Store the VLAN VID of a vlan interface in *vidp, for use by external
  * components such as Infiniband.  Returns EINVAL (leaving *vidp alone)
  * when ifp is not a vlan interface, 0 otherwise.
  *
  * XXXRW: Note that the function name here is historical; it should be named
  * vlan_vid().
  */
 static int
 vlan_tag(struct ifnet *ifp, uint16_t *vidp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		*vidp = ifv->ifv_vid;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Store the PCP value of a vlan interface in *pcpp.  Returns EINVAL
  * (leaving *pcpp alone) when ifp is not a vlan interface, 0 otherwise.
  */
 static int
 vlan_pcp(struct ifnet *ifp, uint16_t *pcpp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		*pcpp = ifv->ifv_pcp;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Return the driver-specific cookie stored on a vlan interface, or NULL
  * when ifp is not a vlan.  Synchronization with vlan_setcookie() must be
  * provided by the driver.
  */
 static void *
 vlan_cookie(struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		return (ifv->ifv_cookie);
 	}
 	return (NULL);
 }
 
 /*
  * Remember a cookie in the vlan softc; drivers use it to keep private
  * per-instance data.  Returns EINVAL when ifp is not a vlan interface,
  * 0 otherwise.
  */
 static int
 vlan_setcookie(struct ifnet *ifp, void *cookie)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		ifv->ifv_cookie = cookie;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Return the vlan interface configured for the given VID on parent ifp,
  * or NULL when ifp carries no trunk or no vlan uses that VID.  Must be
  * called inside the network epoch.
  */
 static struct ifnet *
 vlan_devat(struct ifnet *ifp, uint16_t vid)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL)
 		return (NULL);
 
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL)
 		return (NULL);
 	return (ifv->ifv_ifp);
 }
 
 /*
  * VLAN support can be loaded as a module.  The only place in the
  * system that's intimately aware of this is ether_input.  We hook
  * into this code through vlan_input_p which is defined there and
  * set here.  No one else in the system should be aware of this so
  * we use an explicit reference here.  The pointer is set on MOD_LOAD
  * and reset to NULL on MOD_UNLOAD in vlan_modevent().
  */
 extern	void (*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* For if_link_state_change() eyes only... */
 extern	void (*vlan_link_state_p)(struct ifnet *);
 
 /* Cloner callbacks handed to ifc_attach_cloner() together with vlanname. */
 static struct if_clone_addreq vlan_addreq = {
 	.match_f = vlan_clone_match,
 	.create_f = vlan_clone_create,
 	.destroy_f = vlan_clone_destroy,
 };
 
 /*
  * Module event handler.
  *
  * MOD_LOAD registers the interface event handlers, initializes locking,
  * publishes the vlan_*_p hook pointers and (without VIMAGE) attaches the
  * interface cloner.  MOD_UNLOAD undoes all of that; in particular every
  * hook pointer published on load is reset to NULL so that no caller can
  * jump into unloaded module text.
  *
  * Returns 0 on success, ENOMEM if an event handler cannot be registered,
  * and EOPNOTSUPP for any other event type.
  */
 static int
 vlan_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 		if (ifdetach_tag == NULL)
 			return (ENOMEM);
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		if (iflladdr_tag == NULL)
 			return (ENOMEM);
 		ifevent_tag = EVENTHANDLER_REGISTER(ifnet_event,
 		    vlan_ifevent, NULL, EVENTHANDLER_PRI_ANY);
 		if (ifevent_tag == NULL)
 			return (ENOMEM);
 		VLAN_LOCKING_INIT();
 		vlan_input_p = vlan_input;
 		vlan_link_state_p = vlan_link_state;
 		vlan_trunk_cap_p = vlan_trunk_capabilities;
 		vlan_trunkdev_p = vlan_trunkdev;
 		vlan_cookie_p = vlan_cookie;
 		vlan_setcookie_p = vlan_setcookie;
 		vlan_tag_p = vlan_tag;
 		vlan_pcp_p = vlan_pcp;
 		vlan_devat_p = vlan_devat;
 #ifndef VIMAGE
 		vlan_cloner = ifc_attach_cloner(vlanname, &vlan_addreq);
 #endif
 		if (bootverbose)
 			printf("vlan: initialized, using "
 #ifdef VLAN_ARRAY
 			       "full-size arrays"
 #else
 			       "hash tables with chaining"
 #endif
 
 			       "\n");
 		break;
 	case MOD_UNLOAD:
 #ifndef VIMAGE
 		ifc_detach_cloner(vlan_cloner);
 #endif
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag);
 		EVENTHANDLER_DEREGISTER(ifnet_event, ifevent_tag);
 		/* Clear every hook set in MOD_LOAD, in the same order. */
 		vlan_input_p = NULL;
 		vlan_link_state_p = NULL;
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_cookie_p = NULL;
 		vlan_setcookie_p = NULL;
 		vlan_tag_p = NULL;
 		vlan_pcp_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCKING_DESTROY();
 		if (bootverbose)
 			printf("vlan: unloaded\n");
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 /* Module descriptor: name, event handler, and no private data. */
 static moduledata_t vlan_mod = {
 	"if_vlan",
 	vlan_modevent,
 	0
 };
 
 DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vlan, 3);
 
 #ifdef VIMAGE
 /*
  * Per-vnet constructor: attach the vlan cloner and remember the handle
  * in this vnet's V_vlan_cloner.
  */
 static void
 vnet_vlan_init(const void *unused __unused)
 {
 	vlan_cloner = ifc_attach_cloner(vlanname, &vlan_addreq);
 	V_vlan_cloner = vlan_cloner;
 }
 VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_vlan_init, NULL);
 
 /* Per-vnet destructor: detach the cloner attached by vnet_vlan_init(). */
 static void
 vnet_vlan_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_vlan_cloner);
 }
 VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_vlan_uninit, NULL);
 #endif
 
 /*
  * Check for <etherif>.<vlan>[.<vlan> ...] style interface names.
  *
  * The name is split at its last '.'; the left part must name an existing
  * interface and the right part must be a non-empty string of decimal
  * digits.  On success the referenced parent ifnet is returned (the caller
  * must if_rele() it) and, if vidp is not NULL, the parsed number is
  * stored in *vidp.  NULL is returned when the name does not match.
  *
  * Numbers above 0xffff are rejected: vlan_config() takes the VID as a
  * uint16_t, so larger values would silently wrap to a different VID, and
  * bounding the accumulator also keeps it from overflowing int.
  */
 static struct ifnet *
 vlan_clone_match_ethervid(const char *name, int *vidp)
 {
 	char ifname[IFNAMSIZ];
 	char *cp;
 	struct ifnet *ifp;
 	int vid;
 
 	strlcpy(ifname, name, IFNAMSIZ);
 	if ((cp = strrchr(ifname, '.')) == NULL)
 		return (NULL);
 	*cp = '\0';
 	if ((ifp = ifunit_ref(ifname)) == NULL)
 		return (NULL);
 	/* Parse VID. */
 	if (*++cp == '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	vid = 0;
 	for (; *cp >= '0' && *cp <= '9'; cp++) {
 		vid = (vid * 10) + (*cp - '0');
 		if (vid > 0xffff) {
 			/* Cannot be represented as a 16-bit VID. */
 			if_rele(ifp);
 			return (NULL);
 		}
 	}
 	if (*cp != '\0') {
 		/* Trailing non-digit characters. */
 		if_rele(ifp);
 		return (NULL);
 	}
 	if (vidp != NULL)
 		*vidp = vid;
 
 	return (ifp);
 }
 
 static int
 vlan_clone_match(struct if_clone *ifc, const char *name)
 {
 	struct ifnet *ifp;
 	const char *cp;
 
 	ifp = vlan_clone_match_ethervid(name, NULL);
 	if (ifp != NULL) {
 		if_rele(ifp);
 		return (1);
 	}
 
 	if (strncmp(vlanname, name, strlen(vlanname)) != 0)
 		return (0);
 	for (cp = name + 4; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Cloner create callback: allocate and attach a new vlan interface and,
  * when a parent is known, configure it on that parent.
  *
  * name is the requested interface name (updated in place with the
  * allocated unit number in the wildcard case), len is the size of the
  * name buffer, ifd optionally carries a struct vlanreq, and *ifpp
  * receives the new ifnet on success.  Returns 0 or an errno value; on
  * failure any parent reference, unit number and allocation taken so far
  * is released again.
  */
 static int
 vlan_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	char *dp;
 	bool wildcard = false;
 	bool subinterface = false;
 	int unit;
 	int error;
 	int vid = 0;
 	uint16_t proto = ETHERTYPE_VLAN;
 	struct ifvlan *ifv;
 	struct ifnet *ifp;
 	struct ifnet *p = NULL;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	struct vlanreq vlr;
 	static const u_char eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 
 	/*
 	 * There are three ways to specify the cloned device:
 	 * o pass a parameter block with the clone request.
 	 * o specify parameters in the text of the clone device name
 	 * o specify no parameters and get an unattached device that
 	 *   must be configured separately.
 	 * The first technique is preferred; the latter two are supported
 	 * for backwards compatibility.
 	 *
 	 * XXXRW: Note historic use of the word "tag" here.  New ioctls may be
 	 * called for.
 	 */
 
 	if (ifd->params != NULL) {
 		error = ifc_copyin(ifd, &vlr, sizeof(vlr));
 		if (error)
 			return error;
 		vid = vlr.vlr_tag;
 		proto = vlr.vlr_proto;
 
 #ifdef COMPAT_FREEBSD12
 		if (proto == 0)
 			proto = ETHERTYPE_VLAN;
 #endif
 		/* Holds a reference on the parent until released below. */
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL)
 			return (ENXIO);
 	}
 
 	if ((error = ifc_name2unit(name, &unit)) == 0) {
 
 		/*
 		 * vlanX interface. Set wildcard to true if the unit number
 		 * is not fixed (-1)
 		 */
 		wildcard = (unit < 0);
 	} else {
 		/* Not vlanX; try the <parent>.<vid> form, which sets vid. */
 		struct ifnet *p_tmp = vlan_clone_match_ethervid(name, &vid);
 		if (p_tmp != NULL) {
 			error = 0;
 			subinterface = true;
 			unit = IF_DUNIT_NONE;
 			wildcard = false;
 			if (p != NULL) {
 				/*
 				 * A parent was also given in the parameter
 				 * block; it must be the one named here.
 				 */
 				if_rele(p_tmp);
 				if (p != p_tmp)
 					error = EINVAL;
 			} else
 				p = p_tmp;
 		} else
 			error = ENXIO;
 	}
 
 	if (error != 0) {
 		if (p != NULL)
 			if_rele(p);
 		return (error);
 	}
 
 	if (!subinterface) {
 		/* vlanX interface, mark X as busy or allocate new unit # */
 		error = ifc_alloc_unit(ifc, &unit);
 		if (error != 0) {
 			if (p != NULL)
 				if_rele(p);
 			return (error);
 		}
 	}
 
 	/* In the wildcard case, we need to update the name. */
 	if (wildcard) {
 		/* Find the terminating NUL, then append the unit number. */
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			panic("%s: interface name too long", __func__);
 		}
 	}
 
 	ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
 	ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		if (!subinterface)
 			ifc_free_unit(ifc, unit);
 		free(ifv, M_VLAN);
 		if (p != NULL)
 			if_rele(p);
 		return (ENOSPC);
 	}
 	CK_SLIST_INIT(&ifv->vlan_mc_listhead);
 	ifp->if_softc = ifv;
 	/*
 	 * Set the name manually rather than using if_initname because
 	 * we don't conform to the default naming convention for interfaces.
 	 */
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = vlanname;
 	ifp->if_dunit = unit;
 
 	ifp->if_init = vlan_init;
 #ifdef ALTQ
 	/* With ALTQ, transmit goes through the interface send queue. */
 	ifp->if_start = vlan_altq_start;
 	ifp->if_transmit = vlan_altq_transmit;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	ifp->if_snd.ifq_drv_maxlen = 0;
 	IFQ_SET_READY(&ifp->if_snd);
 #else
 	ifp->if_transmit = vlan_transmit;
 #endif
 	ifp->if_qflush = vlan_qflush;
 	ifp->if_ioctl = vlan_ioctl;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
 	ifp->if_ratelimit_query = vlan_ratelimit_query;
 #endif
 	ifp->if_flags = VLAN_IFFLAGS;
 	/* Attach with an all-zero Ethernet address. */
 	ether_ifattach(ifp, eaddr);
 	/* Now undo some of the damage... */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_L2VLAN;
 	ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN;
 	ifa = ifp->if_addr;
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_L2VLAN;
 
 	if (p != NULL) {
 		error = vlan_config(ifv, p, vid, proto);
 		/* Our own reference on the parent is no longer needed. */
 		if_rele(p);
 		if (error != 0) {
 			/*
 			 * Since we've partially failed, we need to back
 			 * out all the way, otherwise userland could get
 			 * confused.  Thus, we destroy the interface.
 			 */
 			ether_ifdetach(ifp);
 			vlan_unconfig(ifp);
 			if_free(ifp);
 			if (!subinterface)
 				ifc_free_unit(ifc, unit);
 			free(ifv, M_VLAN);
 
 			return (error);
 		}
 	}
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Cloner destroy callback: detach and free a vlan interface.
  *
  * Returns EBUSY when ifp itself is the parent of other vlans
  * (ifp->if_vlantrunk is set), 0 otherwise.  The unit number is returned
  * to the cloner unless the interface was created without one
  * (IF_DUNIT_NONE).
  */
 static int
 vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct ifvlan *ifv = ifp->if_softc;
 	/* Saved now because ifp is freed before the unit is released. */
 	int unit = ifp->if_dunit;
 
 	if (ifp->if_vlantrunk)
 		return (EBUSY);
 
 #ifdef ALTQ
 	IFQ_PURGE(&ifp->if_snd);
 #endif
 	ether_ifdetach(ifp);	/* first, remove it from system-wide lists */
 	vlan_unconfig(ifp);	/* now it can be unconfigured and freed */
 	/*
 	 * We should have the only reference to the ifv now, so we can now
 	 * drain any remaining lladdr task before freeing the ifnet and the
 	 * ifvlan.
 	 */
 	taskqueue_drain(taskqueue_thread, &ifv->lladdr_task);
 	NET_EPOCH_WAIT();
 	if_free(ifp);
 	free(ifv, M_VLAN);
 	if (unit != IF_DUNIT_NONE)
 		ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /*
  * The ifp->if_init entry point for vlan(4) is a no-op; the argument is
  * unused.
  */
 static void
 vlan_init(void *foo __unused)
 {
 }
 
 /*
  * The if_transmit method for vlan(4) interface.
  *
  * Encapsulates m with this vlan's 802.1Q tag via ether_8021q_frame() and
  * hands it to the parent's if_transmit.  Returns ENETDOWN when the vlan
  * has no trunk or the parent is not up and running, EAGAIN when the
  * mbuf's send tag does not belong to the current parent, 0 when
  * encapsulation fails, and otherwise whatever the parent's if_transmit
  * returns.  Output counters on the vlan interface are updated to match.
  * Must be called inside the network epoch.
  */
 static int
 vlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlan *ifv;
 	struct ifnet *p;
 	int error, len, mcast;
 
 	NET_EPOCH_ASSERT();
 
 	ifv = ifp->if_softc;
 	if (TRUNK(ifv) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	p = PARENT(ifv);
 	/* Sample these now; m is handed to the parent further down. */
 	len = m->m_pkthdr.len;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 
 	BPF_MTAP(ifp, m);
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		struct vlan_snd_tag *vst;
 		struct m_snd_tag *mst;
 
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 		mst = m->m_pkthdr.snd_tag;
 		vst = mst_to_vst(mst);
 		if (vst->tag->ifp != p) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			return (EAGAIN);
 		}
 
 		/* Swap the vlan-level tag for the tag it wraps. */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag);
 		m_snd_tag_rele(mst);
 	}
 #endif
 
 	/*
 	 * Do not run parent's if_transmit() if the parent is not up,
 	 * or parent's driver will cause a system crash.
 	 */
 	if (!UP_AND_RUNNING(p)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * NOTE(review): m is not freed here on failure; presumably
 	 * ether_8021q_frame() disposes of it itself -- confirm.
 	 */
 	if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (0);
 	}
 
 	/*
 	 * Send it, precisely as ether_output() would have.
 	 */
 	error = (p->if_transmit)(p, m);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (error);
 }
 
 /*
  * if_output wrapper for vlan interfaces.  Walks down through stacked
  * vlans to the first parent that is not itself a vlan and calls that
  * interface's if_output with the original vlan ifp.  If any vlan along
  * the way has no trunk, the mbuf is freed and ENETDOWN is returned.
  * Must be called inside the network epoch.
  */
 static int
 vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct ifvlan *ifv;
 	struct ifnet *p;
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * Find the first non-VLAN parent interface.
 	 */
 	ifv = ifp->if_softc;
 	for (;;) {
 		if (TRUNK(ifv) == NULL) {
 			m_freem(m);
 			return (ENETDOWN);
 		}
 		p = PARENT(ifv);
 		if (p->if_type != IFT_L2VLAN)
 			break;
 		ifv = p->if_softc;
 	}
 
 	return (p->if_output(ifp, m, dst, ro));
 }
 
 #ifdef ALTQ
 /*
  * if_start method used with ALTQ: drain the interface send queue,
  * passing each packet to vlan_transmit().  The queue lock is held for
  * the whole drain.
  */
 static void
 vlan_altq_start(if_t ifp)
 {
 	struct ifaltq *ifq = &ifp->if_snd;
 	struct mbuf *m;
 
 	IFQ_LOCK(ifq);
 	for (;;) {
 		IFQ_DEQUEUE_NOLOCK(ifq, m);
 		if (m == NULL)
 			break;
 		vlan_transmit(ifp, m);
 	}
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * if_transmit method used with ALTQ.  When ALTQ is enabled on the send
  * queue the packet is enqueued and, if that worked, the queue is
  * drained; otherwise the packet goes straight to vlan_transmit().
  * Returns the enqueue or transmit error.
  */
 static int
 vlan_altq_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (vlan_transmit(ifp, m));
 
 	IFQ_ENQUEUE(&ifp->if_snd, m, err);
 	if (err == 0)
 		vlan_altq_start(ifp);
 	return (err);
 }
 #endif	/* ALTQ */
 
 /*
  * The ifp->if_qflush entry point for vlan(4) is a no-op; the argument is
  * unused.
  */
 static void
 vlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Input hook (installed as vlan_input_p): demultiplex a tagged frame
  * received on parent ifp to the matching vlan interface.
  *
  * The 802.1Q tag is taken from the mbuf header when M_VLANTAG is set,
  * otherwise it is read from, and stripped out of, the in-band Ethernet
  * header.  Frames for which no vlan is configured, or whose vlan is not
  * up and running, are dropped and counted as IFCOUNTER_NOPROTO on the
  * parent.  The mbuf is consumed in every path.  Must be called inside
  * the network epoch.
  */
 static void
 vlan_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct m_tag *mtag;
 	uint16_t vid, tag;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_flags & M_VLANTAG) {
 		/*
 		 * Packet is tagged, but m contains a normal
 		 * Ethernet frame; the tag is stored out-of-band.
 		 */
 		tag = m->m_pkthdr.ether_vtag;
 		m->m_flags &= ~M_VLANTAG;
 	} else {
 		struct ether_vlan_header *evl;
 
 		/*
 		 * Packet is tagged in-band as specified by 802.1q.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 			/* Make sure the whole VLAN header is contiguous. */
 			if (m->m_len < sizeof(*evl) &&
 			    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 				if_printf(ifp, "cannot pullup VLAN header\n");
 				return;
 			}
 			evl = mtod(m, struct ether_vlan_header *);
 			tag = ntohs(evl->evl_tag);
 
 			/*
 			 * Remove the 802.1q header by copying the Ethernet
 			 * addresses over it and adjusting the beginning of
 			 * the data in the mbuf.  The encapsulated Ethernet
 			 * type field is already in place.
 			 */
 			bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 			      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 			m_adj(m, ETHER_VLAN_ENCAP_LEN);
 			break;
 
 		default:
 #ifdef INVARIANTS
 			panic("%s: %s has unsupported if_type %u",
 			      __func__, ifp->if_xname, ifp->if_type);
 #endif
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 	}
 
 	/* The tag carries the VID and the priority; split out the VID. */
 	vid = EVL_VLANOFTAG(tag);
 
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		m_freem(m);
 		return;
 	}
 
 	if (V_vlan_mtag_pcp) {
 		/*
 		 * While uncommon, it is possible that we will find a 802.1q
 		 * packet encapsulated inside another packet that also had an
 		 * 802.1q header.  For example, ethernet tunneled over IPSEC
 		 * arriving over ethernet.  In that case, we replace the
 		 * existing 802.1q PCP m_tag value.
 		 */
 		mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 		if (mtag == NULL) {
 			mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN,
 			    sizeof(uint8_t), M_NOWAIT);
 			if (mtag == NULL) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				m_freem(m);
 				return;
 			}
 			m_tag_prepend(m, mtag);
 		}
 		/* The one-byte payload directly follows the m_tag header. */
 		*(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag);
 	}
 
 	m->m_pkthdr.rcvif = ifv->ifv_ifp;
 	if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1);
 
 	/* Pass it back through the parent's input routine. */
 	(*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m);
 }
 
 /*
  * Task handler for ifv->lladdr_task.  arg is the ifvlan whose ifnet has
  * already had the new link-layer address copied in; this calls
  * if_setlladdr() with that address, in the vnet of the vlan interface.
  */
 static void
 vlan_lladdr_fn(void *arg, int pending __unused)
 {
 	struct ifvlan *ifv = arg;
 	struct ifnet *ifp = ifv->ifv_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	/* The ifv_ifp already has the lladdr copied in. */
 	if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Attach vlan ifv to parent interface p with the given VID and
  * encapsulation protocol, or change the VID/protocol of a vlan that is
  * already attached to p.
  *
  * Returns 0 on success, EPROTONOSUPPORT if p cannot carry vlans, EINVAL
  * for a reserved or out-of-range VID, EBUSY if ifv is already attached
  * to a different parent, and EEXIST if the VID is already in use on p.
  * A failed VID change leaves the vlan exactly as it was: it is put back
  * into the hash under its old VID and its protocol is not touched.
  * On success the vlan_config event is raised for (p, VID).
  */
 static int
 vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid,
 	uint16_t proto)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifnet *ifp;
 	int error = 0;
 
 	/*
 	 * We can handle non-ethernet hardware types as long as
 	 * they handle the tagging and headers themselves.
 	 */
 	if (p->if_type != IFT_ETHER &&
 	    p->if_type != IFT_L2VLAN &&
 	    (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
 		return (EPROTONOSUPPORT);
 	if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
 		return (EPROTONOSUPPORT);
 	/*
 	 * Don't let the caller set up a VLAN VID with
 	 * anything except VLID bits.
 	 * VID numbers 0x0 and 0xFFF are reserved.
 	 */
 	if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK))
 		return (EINVAL);
 	if (ifv->ifv_trunk) {
 		/* Already attached: only VID and protocol may change. */
 		trunk = ifv->ifv_trunk;
 		if (trunk->parent != p)
 			return (EBUSY);
 
 		VLAN_XLOCK();
 
 		if (ifv->ifv_vid != vid) {
 			uint16_t oldvid = ifv->ifv_vid;
 
 			/* Re-hash */
 			vlan_remhash(trunk, ifv);
 			ifv->ifv_vid = vid;
 			error = vlan_inshash(trunk, ifv);
 			if (error != 0) {
 				/*
 				 * The new VID is taken.  Put the vlan back
 				 * under its old VID so it stays in the hash;
 				 * otherwise a later vlan_remhash() would
 				 * panic.  This cannot fail: the old slot was
 				 * vacated just above under the exclusive
 				 * lock.
 				 */
 				ifv->ifv_vid = oldvid;
 				(void)vlan_inshash(trunk, ifv);
 			}
 		}
 		/* Commit the protocol only if the request as a whole worked. */
 		if (error == 0)
 			ifv->ifv_proto = proto;
 		/* Will unlock */
 		goto done;
 	}
 
 	VLAN_XLOCK();
 	if (p->if_vlantrunk == NULL) {
 		/* First vlan on this parent: create the trunk. */
 		trunk = malloc(sizeof(struct ifvlantrunk),
 		    M_VLAN, M_WAITOK | M_ZERO);
 		vlan_inithash(trunk);
 		TRUNK_LOCK_INIT(trunk);
 		TRUNK_WLOCK(trunk);
 		p->if_vlantrunk = trunk;
 		trunk->parent = p;
 		if_ref(trunk->parent);
 		TRUNK_WUNLOCK(trunk);
 	} else {
 		trunk = p->if_vlantrunk;
 	}
 
 	ifv->ifv_vid = vid;	/* must set this before vlan_inshash() */
 	ifv->ifv_pcp = 0;       /* Default: best effort delivery. */
 	error = vlan_inshash(trunk, ifv);
 	if (error)
 		goto done;
 	ifv->ifv_proto = proto;
 	ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
 	ifv->ifv_mintu = ETHERMIN;
 	ifv->ifv_pflags = 0;
 	ifv->ifv_capenable = -1;
 
 	/*
 	 * If the parent supports the VLAN_MTU capability,
 	 * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames,
 	 * use it.
 	 */
 	if (p->if_capenable & IFCAP_VLAN_MTU) {
 		/*
 		 * No need to fudge the MTU since the parent can
 		 * handle extended frames.
 		 */
 		ifv->ifv_mtufudge = 0;
 	} else {
 		/*
 		 * Fudge the MTU by the encapsulation size.  This
 		 * makes us incompatible with strictly compliant
 		 * 802.1Q implementations, but allows us to use
 		 * the feature with other NetBSD implementations,
 		 * which might still be useful.
 		 */
 		ifv->ifv_mtufudge = ifv->ifv_encaplen;
 	}
 
 	ifv->ifv_trunk = trunk;
 	ifp = ifv->ifv_ifp;
 	/*
 	 * Initialize fields from our parent.  This duplicates some
 	 * work with ether_ifattach() but allows for non-ethernet
 	 * interfaces to also work.
 	 */
 	ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
 	ifp->if_baudrate = p->if_baudrate;
 	ifp->if_input = p->if_input;
 	ifp->if_resolvemulti = p->if_resolvemulti;
 	ifp->if_addrlen = p->if_addrlen;
 	ifp->if_broadcastaddr = p->if_broadcastaddr;
 	ifp->if_pcp = ifv->ifv_pcp;
 
 	/*
 	 * We wrap the parent's if_output using vlan_output to ensure that it
 	 * can't become stale.
 	 */
 	ifp->if_output = vlan_output;
 
 	/*
 	 * Copy only a selected subset of flags from the parent.
 	 * Other flags are none of our business.
 	 */
 #define VLAN_COPY_FLAGS (IFF_SIMPLEX)
 	ifp->if_flags &= ~VLAN_COPY_FLAGS;
 	ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
 #undef VLAN_COPY_FLAGS
 
 	ifp->if_link_state = p->if_link_state;
 
 	NET_EPOCH_ENTER(et);
 	vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * Set up our interface address to reflect the underlying
 	 * physical interface's.
 	 */
 	TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv);
 	((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen =
 	    p->if_addrlen;
 
 	/*
 	 * Do not schedule link address update if it was the same
 	 * as previous parent's. This helps avoid updating for each
 	 * associated llentry.
 	 */
 	if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) {
 		bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen);
 		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 	}
 
 	/* We are ready for operation now. */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* Update flags on the parent, if necessary. */
 	vlan_setflags(ifp, 1);
 
 	/*
 	 * Configure multicast addresses that may already be
 	 * joined on the vlan device.
 	 */
 	(void)vlan_setmulti(ifp);
 
 done:
 	if (error == 0)
 		EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid);
 	VLAN_XUNLOCK();
 
 	return (error);
 }
 
 /*
  * Unconfigure a vlan interface.
  *
  * Convenience wrapper that takes the VLAN exclusive lock around
  * vlan_unconfig_locked(), passing departing == 0.
  */
 static void
 vlan_unconfig(struct ifnet *ifp)
 {
 
 	VLAN_XLOCK();
 	vlan_unconfig_locked(ifp, 0);
 	VLAN_XUNLOCK();
 }
 
 /*
  * Detach the vlan interface "ifp" from its trunk, if it has one.
  *
  * The caller must hold the VLAN exclusive lock (asserted below).
  * "departing" is non-zero when the parent interface itself is going away;
  * in that case no attempt is made to delete our multicast addresses from
  * the parent.
  *
  * The trunk is destroyed once its reference count reaches zero.  The
  * vlan_unconfig event is only invoked when the interface actually had a
  * parent.
  */
 static void
 vlan_unconfig_locked(struct ifnet *ifp, int departing)
 {
 	struct ifvlantrunk *trunk;
 	struct vlan_mc_entry *mc;
 	struct ifvlan *ifv;
 	struct ifnet  *parent;
 	int error;
 
 	VLAN_XLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 	trunk = ifv->ifv_trunk;
 	parent = NULL;
 
 	if (trunk != NULL) {
 		parent = trunk->parent;
 
 		/*
 		 * Since the interface is being unconfigured, we need to
 		 * empty the list of multicast groups that we may have joined
 		 * while we were alive from the parent's list.
 		 */
 		while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
 			/*
 			 * If the parent interface is being detached,
 			 * all its multicast addresses have already
 			 * been removed.  Warn about errors if
 			 * if_delmulti() does fail, but don't abort as
 			 * all callers expect vlan destruction to
 			 * succeed.
 			 */
 			if (!departing) {
 				error = if_delmulti(parent,
 				    (struct sockaddr *)&mc->mc_addr);
 				if (error)
 					if_printf(ifp,
 		    "Failed to delete multicast address from parent: %d\n",
 					    error);
 			}
 			/* Unlink now; the entry itself is freed via NET_EPOCH_CALL. */
 			CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
 			NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 		}
 
 		vlan_setflags(ifp, 0); /* clear special flags on parent */
 
 		vlan_remhash(trunk, ifv);
 		ifv->ifv_trunk = NULL;
 
 		/*
 		 * Check if we were the last.
 		 */
 		if (trunk->refcnt == 0) {
 			/*
 			 * Unpublish the trunk from the parent before waiting
 			 * and destroying it.
 			 * NOTE(review): presumably NET_EPOCH_WAIT() drains
 			 * readers that may still hold the old pointer - confirm.
 			 */
 			parent->if_vlantrunk = NULL;
 			NET_EPOCH_WAIT();
 			trunk_destroy(trunk);
 		}
 	}
 
 	/* Disconnect from parent. */
 	if (ifv->ifv_pflags)
 		if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_link_state = LINK_STATE_UNKNOWN;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Only dispatch an event if vlan was
 	 * attached, otherwise there is nothing
 	 * to cleanup anyway.
 	 */
 	if (parent != NULL)
 		EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid);
 }
 
 /*
  * Synchronise one reference-counted interface flag with the parent.
  *
  * "flag" is the IFF_* bit to handle.  When "status" is non-zero the wanted
  * state is whatever "flag" currently is in ifp->if_flags; when it is zero
  * the wanted state is "cleared".  "func" is called on the parent to take or
  * drop a reference to the flag.
  *
  * ifv_pflags remembers what we last asked of the parent, so the parent is
  * only touched when the wanted state differs from the recorded one and we
  * never drop a reference that we did not take ourselves.
  *
  * Returns 0 on success, or the error returned by "func" (in which case
  * ifv_pflags is left unchanged).
  */
 static int
 vlan_setflag(struct ifnet *ifp, int flag, int status,
 	     int (*func)(struct ifnet *, int))
 {
 	struct ifvlan *ifv;
 	int rc, wanted;
 
 	VLAN_SXLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 
 	/* "wanted" is either the flag value itself or 0. */
 	wanted = 0;
 	if (status)
 		wanted = ifp->if_flags & flag;
 
 	/* Nothing to do when the parent already is in the wanted state. */
 	if (wanted == (ifv->ifv_pflags & flag))
 		return (0);
 
 	rc = (*func)(PARENT(ifv), wanted);
 	if (rc)
 		return (rc);
 
 	/* Record the new state only after the parent accepted it. */
 	ifv->ifv_pflags = (ifv->ifv_pflags & ~flag) | wanted;
 	return (0);
 }
 
 /*
  * Apply vlan_setflag() to every entry of the vlan_pflags table, i.e. to all
  * IFF_* flags that have to be mirrored onto the parent:
  * a non-zero "status" brings the parent in line with our if_flags,
  * a zero "status" clears whatever we had set on the parent.
  *
  * Stops at, and returns, the first error; returns 0 if every flag was
  * handled.
  */
 static int
 vlan_setflags(struct ifnet *ifp, int status)
 {
 	int i, rc;
 
 	/* The table is terminated by an entry whose flag is zero. */
 	i = 0;
 	while (vlan_pflags[i].flag != 0) {
 		rc = vlan_setflag(ifp, vlan_pflags[i].flag, status,
 		    vlan_pflags[i].func);
 		if (rc != 0)
 			return (rc);
 		i++;
 	}
 	return (0);
 }
 
 /*
  * Push the parent's baudrate and link state down to every vlan configured
  * on "ifp".  Does nothing when the parent has no trunk.
  */
 static void
 vlan_link_state(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct ifnet *vifp;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(ifv, trunk) {
 			vifp = ifv->ifv_ifp;
 			vifp->if_baudrate = trunk->parent->if_baudrate;
 			if_link_state_change(vifp,
 			    trunk->parent->if_link_state);
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Recompute the capabilities, enabled capabilities and hardware assist
  * flags of a vlan interface from those of its parent.
  *
  * Enabled capabilities are derived from the parent's if_capenable masked by
  * ifv_capenable (the value recorded by SIOCSIFCAP in vlan_ioctl()).  The
  * computed sets replace ifp->if_capabilities, if_capenable and if_hwassist
  * wholesale at the end.
  *
  * Must be called inside the net epoch with the VLAN lock held; both are
  * asserted.
  */
 static void
 vlan_capabilities(struct ifvlan *ifv)
 {
 	struct ifnet *p;
 	struct ifnet *ifp;
 	struct ifnet_hw_tsomax hw_tsomax;
 	int cap = 0, ena = 0, mena;
 	u_long hwa = 0;
 
 	NET_EPOCH_ASSERT();
 	VLAN_SXLOCK_ASSERT();
 
 	p = PARENT(ifv);
 	ifp = ifv->ifv_ifp;
 
 	/* Mask parent interface enabled capabilities disabled by user. */
 	mena = p->if_capenable & ifv->ifv_capenable;
 
 	/*
 	 * If the parent interface can do checksum offloading
 	 * on VLANs, then propagate its hardware-assisted
 	 * checksumming flags. Also assert that checksum
 	 * offloading requires hardware VLAN tagging.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
 	    p->if_capenable & IFCAP_VLAN_HWTAGGING) {
 		ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 		if (ena & IFCAP_TXCSUM)
 			hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP |
 			    CSUM_UDP | CSUM_SCTP);
 		if (ena & IFCAP_TXCSUM_IPV6)
 			hwa |= p->if_hwassist & (CSUM_TCP_IPV6 |
 			    CSUM_UDP_IPV6 | CSUM_SCTP_IPV6);
 	}
 
 	/*
 	 * If the parent interface can do TSO on VLANs then
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	if_hw_tsomax_common(p, &hw_tsomax);
 	if_hw_tsomax_update(ifp, &hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		cap |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
 		ena |= mena & IFCAP_TSO;
 		if (ena & IFCAP_TSO)
 			hwa |= p->if_hwassist & CSUM_TSO;
 	}
 
 	/*
 	 * If the parent interface can do LRO and checksum offloading on
 	 * VLANs, then guess it may do LRO on VLANs.  False positive here
 	 * cost nothing, while false negative may lead to some confusions.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & IFCAP_LRO;
 	/*
 	 * NOTE(review): unlike every other capability in this function, LRO
 	 * is taken from p->if_capenable directly rather than from "mena", so
 	 * ifv_capenable does not mask it - confirm this is intended.
 	 */
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM)
 		ena |= p->if_capenable & IFCAP_LRO;
 
 	/*
 	 * If the parent interface can offload TCP connections over VLANs then
 	 * propagate its TOE capability to the VLAN interface.
 	 *
 	 * All TOE drivers in the tree today can deal with VLANs.  If this
 	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
 	 * with its own bit.
 	 */
 #define	IFCAP_VLAN_TOE IFCAP_TOE
 	if (p->if_capabilities & IFCAP_VLAN_TOE)
 		cap |= p->if_capabilities & IFCAP_TOE;
 	if (p->if_capenable & IFCAP_VLAN_TOE) {
 		TOEDEV(ifp) = TOEDEV(p);
 		ena |= mena & IFCAP_TOE;
 	}
 
 	/*
 	 * If the parent interface supports dynamic link state, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_LINKSTATE);
 	ena |= (mena & IFCAP_LINKSTATE);
 
 #ifdef RATELIMIT
 	/*
 	 * If the parent interface supports ratelimiting, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_TXRTLMT);
 	ena |= (mena & IFCAP_TXRTLMT);
 #endif
 
 	/*
 	 * If the parent interface supports unmapped mbufs, so does
 	 * the VLAN interface.  Note that this should be fine even for
 	 * interfaces that don't support hardware tagging as headers
 	 * are prepended in normal mbufs to unmapped mbufs holding
 	 * payload data.
 	 */
 	cap |= (p->if_capabilities & IFCAP_MEXTPG);
 	ena |= (mena & IFCAP_MEXTPG);
 
 	/*
 	 * If the parent interface can offload encryption and segmentation
 	 * of TLS records over TCP, propagate its capability to the VLAN
 	 * interface.
 	 *
 	 * All TLS drivers in the tree today can deal with VLANs.  If
 	 * this ever changes, then a new IFCAP_VLAN_TXTLS can be
 	 * defined.
 	 */
 	if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
 		cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
 	if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
 		ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
 
 	/* Publish the freshly computed sets on the vlan interface. */
 	ifp->if_capabilities = cap;
 	ifp->if_capenable = ena;
 	ifp->if_hwassist = hwa;
 }
 
 static void
 vlan_trunk_capabilities(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	VLAN_SLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_SUNLOCK();
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 	VLAN_FOREACH(ifv, trunk)
 		vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 	VLAN_SUNLOCK();
 }
 
 /*
  * ioctl handler of a vlan interface.
  *
  * "data" is interpreted as a struct ifreq for most commands (as a struct
  * ifaddr for SIOCSIFADDR and a struct ifmediareq for SIOCGIFMEDIA).
  *
  * Returns 0 on success or an errno value.  Commands that are not listed in
  * the switch below, as well as SIOCSIFMEDIA, yield EINVAL.
  */
 static int
 vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifnet *p;
 	struct ifreq *ifr;
 #ifdef INET
 	struct ifaddr *ifa;
 #endif
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 	struct vlanreq vlr;
 	int error = 0, oldmtu;
 
 	ifr = (struct ifreq *)data;
 #ifdef INET
 	ifa = (struct ifaddr *) data;
 #endif
 	ifv = ifp->if_softc;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		/* Mark the interface up; for AF_INET also run arp_ifinit(). */
 		ifp->if_flags |= IFF_UP;
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 #endif
 		break;
 	case SIOCGIFADDR:
 		/* Copy our link-level address into the request. */
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ifp->if_addrlen);
 		break;
 	case SIOCGIFMEDIA:
 		/*
 		 * Forward the request to the parent, holding a reference on
 		 * it for the duration of the call.  Fails with EINVAL when
 		 * no parent is configured.
 		 */
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			p = PARENT(ifv);
 			if_ref(p);
 			error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data);
 			if_rele(p);
 			/* Limit the result to the parent's current config. */
 			if (error == 0) {
 				struct ifmediareq *ifmr;
 
 				ifmr = (struct ifmediareq *)data;
 				if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
 					ifmr->ifm_count = 1;
 					error = copyout(&ifmr->ifm_current,
 						ifmr->ifm_ulist,
 						sizeof(int));
 				}
 			}
 		} else {
 			error = EINVAL;
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSIFMEDIA:
 		/* Setting the media on a vlan interface is always rejected. */
 		error = EINVAL;
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 *
 		 * The new value must not exceed the parent's MTU minus
 		 * ifv_mtufudge and must not be below ifv_mintu minus
 		 * ifv_mtufudge.  Without a parent the request fails.
 		 */
 		VLAN_SLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			TRUNK_WLOCK(trunk);
 			if (ifr->ifr_mtu >
 			     (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
 			    ifr->ifr_mtu <
 			     (ifv->ifv_mintu - ifv->ifv_mtufudge))
 				error = EINVAL;
 			else
 				ifp->if_mtu = ifr->ifr_mtu;
 			TRUNK_WUNLOCK(trunk);
 		} else
 			error = EINVAL;
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSETVLAN:
 #ifdef VIMAGE
 		/*
 		 * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN
 		 * interface to be delegated to a jail without allowing the
 		 * jail to change what underlying interface/VID it is
 		 * associated with.  We are not entirely convinced that this
 		 * is the right way to accomplish that policy goal.
 		 */
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr));
 		if (error)
 			break;
 		/* An empty parent name requests detaching from the parent. */
 		if (vlr.vlr_parent[0] == '\0') {
 			vlan_unconfig(ifp);
 			break;
 		}
 		/* The reference taken here is dropped after vlan_config(). */
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL) {
 			error = ENOENT;
 			break;
 		}
 #ifdef COMPAT_FREEBSD12
 		/* A zero protocol is replaced by ETHERTYPE_VLAN. */
 		if (vlr.vlr_proto == 0)
 			vlr.vlr_proto = ETHERTYPE_VLAN;
 #endif
 		oldmtu = ifp->if_mtu;
 		error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto);
 		if_rele(p);
 
 		/*
 		 * VLAN MTU may change during addition of the vlandev.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 
 	case SIOCGETVLAN:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		/* vlr stays all-zero when no parent is configured. */
 		bzero(&vlr, sizeof(vlr));
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
 			    sizeof(vlr.vlr_parent));
 			vlr.vlr_tag = ifv->ifv_vid;
 			vlr.vlr_proto = ifv->ifv_proto;
 		}
 		VLAN_SUNLOCK();
 		error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr));
 		break;
 
 	case SIOCSIFFLAGS:
 		/*
 		 * We should propagate selected flags to the parent,
 		 * e.g., promiscuous mode.
 		 */
 		VLAN_XLOCK();
 		if (TRUNK(ifv) != NULL)
 			error = vlan_setflags(ifp, 1);
 		VLAN_XUNLOCK();
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/*
 		 * If we don't have a parent, just remember the membership for
 		 * when we do.
 		 *
 		 * XXX We need the rmlock here to avoid sleeping while
 		 * holding in6_multi_mtx.
 		 */
 		VLAN_XLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL)
 			error = vlan_setmulti(ifp);
 		VLAN_XUNLOCK();
 
 		break;
 	case SIOCGVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		ifr->ifr_vlan_pcp = ifv->ifv_pcp;
 		break;
 
 	case SIOCSVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		/* Requires PRIV_NET_SETVLANPCP and a value <= VLAN_PCP_MAX. */
 		error = priv_check(curthread, PRIV_NET_SETVLANPCP);
 		if (error)
 			break;
 		if (ifr->ifr_vlan_pcp > VLAN_PCP_MAX) {
 			error = EINVAL;
 			break;
 		}
 		ifv->ifv_pcp = ifr->ifr_vlan_pcp;
 		ifp->if_pcp = ifv->ifv_pcp;
 		/* broadcast event about PCP change */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		break;
 
 	case SIOCSIFCAP:
 		/*
 		 * Record the requested capability set; vlan_capabilities()
 		 * uses it to mask the parent's enabled capabilities.  The
 		 * effective capabilities are only recomputed when a parent
 		 * is configured.
 		 */
 		VLAN_SLOCK();
 		ifv->ifv_capenable = ifr->ifr_reqcap;
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			vlan_capabilities(ifv);
 			NET_EPOCH_EXIT(et);
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Allocate a send tag for a vlan interface by allocating one on the parent.
  *
  * For tag types that have a vlan_snd_tag_*_sw switch, the parent's tag is
  * wrapped in a struct vlan_snd_tag and the wrapper is returned through
  * "ppmt".  For IF_SND_TAG_TYPE_TLS_RX no wrapper is used: the request's
  * vlan_id must be zero on entry, is filled in with our VID, and the
  * parent's tag is returned directly.
  *
  * Returns 0 on success, EOPNOTSUPP for an unsupported tag type or when no
  * parent is configured, ENOMEM when the wrapper cannot be allocated, or
  * the error returned by m_snd_tag_alloc().
  */
 static int
 vlan_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct epoch_tracker et;
 	const struct if_snd_tag_sw *sw;
 	struct vlan_snd_tag *vst;
 	struct ifvlan *ifv;
 	struct ifnet *parent;
 	struct m_snd_tag *mst;
 	int error;
 
 	NET_EPOCH_ENTER(et);
 	ifv = ifp->if_softc;
 
 	/* Select the method switch for the wrapper; NULL means "no wrapper". */
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		sw = &vlan_snd_tag_ul_sw;
 		break;
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		sw = &vlan_snd_tag_rl_sw;
 		break;
 #endif
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		sw = &vlan_snd_tag_tls_sw;
 		break;
 	case IF_SND_TAG_TYPE_TLS_RX:
 		sw = NULL;
 		if (params->tls_rx.vlan_id != 0)
 			goto failure;
 		params->tls_rx.vlan_id = ifv->ifv_vid;
 		break;
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		sw = &vlan_snd_tag_tls_rl_sw;
 		break;
 #endif
 #endif
 	default:
 		goto failure;
 	}
 
 	if (ifv->ifv_trunk != NULL)
 		parent = PARENT(ifv);
 	else
 		parent = NULL;
 	if (parent == NULL)
 		goto failure;
 	/* Take a reference on the parent before leaving the epoch section. */
 	if_ref(parent);
 	NET_EPOCH_EXIT(et);
 
 	if (sw != NULL) {
 		vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT);
 		if (vst == NULL) {
 			if_rele(parent);
 			return (ENOMEM);
 		}
 	} else
 		vst = NULL;
 
 	error = m_snd_tag_alloc(parent, params, &mst);
 	if_rele(parent);
 	if (error) {
 		/* vst is NULL here when no wrapper was allocated. */
 		free(vst, M_VLAN);
 		return (error);
 	}
 
 	if (sw != NULL) {
 		m_snd_tag_init(&vst->com, ifp, sw);
 		vst->tag = mst;
 
 		*ppmt = &vst->com;
 	} else
 		*ppmt = mst;
 
 	return (0);
 failure:
 	/* Only reached while still inside the epoch section. */
 	NET_EPOCH_EXIT(et);
 	return (EOPNOTSUPP);
 }
 
 static struct m_snd_tag *
 vlan_next_snd_tag(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag);
 }
 
 static int
 vlan_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag->sw->snd_tag_modify(vst->tag, params));
 }
 
 static int
 vlan_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag->sw->snd_tag_query(vst->tag, params));
 }
 
 /*
  * Release the vlan send tag "mst": drop our reference on the wrapped tag,
  * then free the wrapper itself.
  */
 static void
 vlan_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *wrapper = mst_to_vst(mst);
 
 	m_snd_tag_rele(wrapper->tag);
 	free(wrapper, M_VLAN);
 }
 
 /*
  * Answer a ratelimit query for a vlan interface.
  *
  * A vlan is only an indirect interface, so no rate table is reported here;
  * the result is flagged RT_IS_INDIRECT and the caller has to obtain its
  * ratelimit tag on the interface the flow will actually go out on.
  */
 static void
 vlan_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 
 	q->flags = RT_IS_INDIRECT;
 	q->rate_table = NULL;
 	q->number_of_rates = 0;
 	q->max_flows = 0;
 }
 
 #endif
diff --git a/sys/net/if_vxlan.c b/sys/net/if_vxlan.c
index 27ad2ec08335..45a24f2b75eb 100644
--- a/sys/net/if_vxlan.c
+++ b/sys/net/if_vxlan.c
@@ -1,3701 +1,3702 @@
 /*-
  * Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
  * All rights reserved.
  * Copyright (c) 2020, Chelsio Communications.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/hash.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_vxlan.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/in_fib.h>
 #include <netinet6/in6_fib.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 
 struct vxlan_softc;
 LIST_HEAD(vxlan_softc_head, vxlan_softc);
 
 struct sx vxlan_sx;
 SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock");
 
 /*
  * One multicast membership record of a vxlan socket; struct vxlan_socket
  * holds an array of VXLAN_SO_MC_MAX_GROUPS of these.
  *
  * NOTE(review): field meanings are inferred from their names - saddr looks
  * like the source address, gaddr the group address, ifidx an interface
  * index and users a use count; confirm against the vxlan_socket_mc_*()
  * functions.
  */
 struct vxlan_socket_mc_info {
 	union vxlan_sockaddr		 vxlsomc_saddr;
 	union vxlan_sockaddr		 vxlsomc_gaddr;
 	int				 vxlsomc_ifidx;
 	int				 vxlsomc_users;
 };
 
 /*
  * The maximum MTU of encapsulated ethernet frame within IPv4/UDP packet.
  *
  * Computed as IP_MAXPACKET minus the largest IPv4 header, the UDP and VXLAN
  * headers, and the ethernet header, CRC and VLAN encapsulation lengths.
  */
 #define VXLAN_MAX_MTU	(IP_MAXPACKET - \
 		60 /* Maximum IPv4 header len */ - \
 		sizeof(struct udphdr) - \
 		sizeof(struct vxlan_header) - \
 		ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
 #define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
 
 /* Number of entries in the vxlso_mc[] array of struct vxlan_socket. */
 #define VXLAN_SO_MC_MAX_GROUPS		32
 
 /*
  * The vxlso_vni_hash[] table of struct vxlan_socket has
  * 1 << VXLAN_SO_VNI_HASH_SHIFT (64) buckets; a VNI maps to its bucket by
  * taking it modulo the table size.
  */
 #define VXLAN_SO_VNI_HASH_SHIFT		6
 #define VXLAN_SO_VNI_HASH_SIZE		(1 << VXLAN_SO_VNI_HASH_SHIFT)
 #define VXLAN_SO_VNI_HASH(_vni)		((_vni) % VXLAN_SO_VNI_HASH_SIZE)
 
 /*
  * A vxlan socket: a kernel socket together with the softcs attached to it
  * (hashed by VNI into vxlso_vni_hash[]) and its multicast membership
  * records (vxlso_mc[]).
  *
  * vxlso_lock is an rmlock taken through the VXLAN_SO_*LOCK() macros and
  * vxlso_refcnt is manipulated through VXLAN_SO_ACQUIRE()/VXLAN_SO_RELEASE().
  * NOTE(review): vxlso_laddr is presumably the local address the socket is
  * bound to, and vxlso_entry links the socket into a list of all vxlan
  * sockets - confirm.
  */
 struct vxlan_socket {
 	struct socket			*vxlso_sock;
 	struct rmlock			 vxlso_lock;
 	u_int				 vxlso_refcnt;
 	union vxlan_sockaddr		 vxlso_laddr;
 	LIST_ENTRY(vxlan_socket)	 vxlso_entry;
 	struct vxlan_softc_head		 vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE];
 	struct vxlan_socket_mc_info	 vxlso_mc[VXLAN_SO_MC_MAX_GROUPS];
 };
 
 /*
  * Locking helpers for struct vxlan_socket, all operating on the vxlso_lock
  * rmlock.  "_p" is the tracker argument handed to rm_rlock()/rm_runlock().
  */
 #define VXLAN_SO_RLOCK(_vso, _p)	rm_rlock(&(_vso)->vxlso_lock, (_p))
 #define VXLAN_SO_RUNLOCK(_vso, _p)	rm_runlock(&(_vso)->vxlso_lock, (_p))
 #define VXLAN_SO_WLOCK(_vso)		rm_wlock(&(_vso)->vxlso_lock)
 #define VXLAN_SO_WUNLOCK(_vso)		rm_wunlock(&(_vso)->vxlso_lock)
 #define VXLAN_SO_LOCK_ASSERT(_vso) \
     rm_assert(&(_vso)->vxlso_lock, RA_LOCKED)
 #define VXLAN_SO_LOCK_WASSERT(_vso) \
     rm_assert(&(_vso)->vxlso_lock, RA_WLOCKED)
 
 /* Reference counting helpers for struct vxlan_socket (vxlso_refcnt). */
 #define VXLAN_SO_ACQUIRE(_vso)		refcount_acquire(&(_vso)->vxlso_refcnt)
 #define VXLAN_SO_RELEASE(_vso)		refcount_release(&(_vso)->vxlso_refcnt)
 
 /*
  * One forwarding table entry, keyed by an ethernet MAC address
  * (vxlfe_mac) and linked into a hash bucket through vxlfe_hash.
  * vxlfe_flags holds VXLAN_FE_FLAG_* bits.
  *
  * NOTE(review): vxlfe_raddr is presumably the remote address the MAC is
  * reachable through and vxlfe_expire the expiry time of a dynamic entry -
  * confirm against the vxlan_ftable_*() functions.
  */
 struct vxlan_ftable_entry {
 	LIST_ENTRY(vxlan_ftable_entry)	 vxlfe_hash;
 	uint16_t			 vxlfe_flags;
 	uint8_t				 vxlfe_mac[ETHER_ADDR_LEN];
 	union vxlan_sockaddr		 vxlfe_raddr;
 	time_t				 vxlfe_expire;
 };
 
 /* Bits stored in vxlfe_flags of struct vxlan_ftable_entry. */
 #define VXLAN_FE_FLAG_DYNAMIC		0x01
 #define VXLAN_FE_FLAG_STATIC		0x02
 
 /* Non-zero when the entry has VXLAN_FE_FLAG_DYNAMIC set. */
 #define VXLAN_FE_IS_DYNAMIC(_fe) \
     ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC)
 
 /*
  * Forwarding table geometry: 1 << VXLAN_SC_FTABLE_SHIFT (512) buckets.
  * VXLAN_SC_FTABLE_HASH() maps a MAC address to a bucket index by reducing
  * vxlan_mac_hash() modulo the table size.
  */
 #define VXLAN_SC_FTABLE_SHIFT		9
 #define VXLAN_SC_FTABLE_SIZE		(1 << VXLAN_SC_FTABLE_SHIFT)
 #define VXLAN_SC_FTABLE_MASK		(VXLAN_SC_FTABLE_SIZE - 1)
 #define VXLAN_SC_FTABLE_HASH(_sc, _mac)	\
     (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE)
 
 LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry);
 
 /*
  * Per-interface statistics (embedded in struct vxlan_softc as vxl_stats).
  *
  * The two ftable_* members are plain 32-bit counters; txcsum, tso and
  * rxcsum are counter_u64_t handles.
  * NOTE(review): the counter_u64_t members are presumably set up and torn
  * down by vxlan_stats_alloc()/vxlan_stats_free() - confirm.
  */
 struct vxlan_statistics {
 	uint32_t	ftable_nospace;
 	uint32_t	ftable_lock_upgrade_failed;
 	counter_u64_t	txcsum;
 	counter_u64_t	tso;
 	counter_u64_t	rxcsum;
 };
 
 /*
  * Per-interface state (softc) of a vxlan interface.
  *
  * vxl_lock is an rmlock taken through the VXLAN_*LOCK() macros and
  * vxl_refcnt is manipulated through VXLAN_ACQUIRE()/VXLAN_RELEASE().
  * vxl_flags holds the VXLAN_FLAG_* bits defined inline below.
  * vxl_ftable points at the MAC-to-forwarding-entry hash table described by
  * the vxl_ftable_* members.
  */
 struct vxlan_softc {
 	struct ifnet			*vxl_ifp;
 	int				 vxl_reqcap;
 	u_int				 vxl_fibnum;
 	struct vxlan_socket		*vxl_sock;
 	uint32_t			 vxl_vni;
 	union vxlan_sockaddr		 vxl_src_addr;
 	union vxlan_sockaddr		 vxl_dst_addr;
 	uint32_t			 vxl_flags;
 #define VXLAN_FLAG_INIT		0x0001
 #define VXLAN_FLAG_TEARDOWN	0x0002
 #define VXLAN_FLAG_LEARN	0x0004
 #define VXLAN_FLAG_USER_MTU	0x0008
 
 	uint32_t			 vxl_port_hash_key;
 	uint16_t			 vxl_min_port;
 	uint16_t			 vxl_max_port;
 	uint8_t				 vxl_ttl;
 
 	/* Lookup table from MAC address to forwarding entry. */
 	uint32_t			 vxl_ftable_cnt;
 	uint32_t			 vxl_ftable_max;
 	uint32_t			 vxl_ftable_timeout;
 	uint32_t			 vxl_ftable_hash_key;
 	struct vxlan_ftable_head	*vxl_ftable;
 
 	/* Derived from vxl_dst_addr. */
 	struct vxlan_ftable_entry	 vxl_default_fe;
 
 	struct ip_moptions		*vxl_im4o;
 	struct ip6_moptions		*vxl_im6o;
 
 	struct rmlock			 vxl_lock;
 	volatile u_int			 vxl_refcnt;
 
 	int				 vxl_unit;
 	int				 vxl_vso_mc_index;
 	struct vxlan_statistics		 vxl_stats;
 	struct sysctl_oid		*vxl_sysctl_node;
 	struct sysctl_ctx_list		 vxl_sysctl_ctx;
 	struct callout			 vxl_callout;
 	struct ether_addr		 vxl_hwaddr;
 	int				 vxl_mc_ifindex;
 	struct ifnet			*vxl_mc_ifp;
 	struct ifmedia 			 vxl_media;
 	char				 vxl_mc_ifname[IFNAMSIZ];
 	LIST_ENTRY(vxlan_softc)		 vxl_entry;
 	LIST_ENTRY(vxlan_softc)		 vxl_ifdetach_list;
 
 	/* For rate limiting errors on the tx fast path. */
 	struct timeval err_time;
 	int err_pps;
 };
 
 /*
  * Locking helpers for struct vxlan_softc, all operating on the vxl_lock
  * rmlock.  "_p" is the tracker argument handed to rm_rlock()/rm_runlock().
  */
 #define VXLAN_RLOCK(_sc, _p)	rm_rlock(&(_sc)->vxl_lock, (_p))
 #define VXLAN_RUNLOCK(_sc, _p)	rm_runlock(&(_sc)->vxl_lock, (_p))
 #define VXLAN_WLOCK(_sc)	rm_wlock(&(_sc)->vxl_lock)
 #define VXLAN_WUNLOCK(_sc)	rm_wunlock(&(_sc)->vxl_lock)
 #define VXLAN_LOCK_WOWNED(_sc)	rm_wowned(&(_sc)->vxl_lock)
 #define VXLAN_LOCK_ASSERT(_sc)	rm_assert(&(_sc)->vxl_lock, RA_LOCKED)
 #define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED)
 /*
  * Drop vxl_lock in whichever mode it is held: the write lock when
  * rm_wowned() reports it as write-owned, otherwise the read lock using
  * tracker "_p".
  */
 #define VXLAN_UNLOCK(_sc, _p) do {		\
     if (VXLAN_LOCK_WOWNED(_sc))			\
 	VXLAN_WUNLOCK(_sc);			\
     else					\
 	VXLAN_RUNLOCK(_sc, _p);			\
 } while (0)
 
 /* Reference counting helpers for struct vxlan_softc (vxl_refcnt). */
 #define VXLAN_ACQUIRE(_sc)	refcount_acquire(&(_sc)->vxl_refcnt)
 #define VXLAN_RELEASE(_sc)	refcount_release(&(_sc)->vxl_refcnt)
 
 /* Casts from a generic sockaddr pointer to const IPv4/IPv6 sockaddrs. */
 #define	satoconstsin(sa)	((const struct sockaddr_in *)(sa))
 #define	satoconstsin6(sa)	((const struct sockaddr_in6 *)(sa))
 
 /*
  * A UDP header immediately followed by a VXLAN header; declared __packed
  * so that no padding is inserted between or after the two.
  */
 struct vxlanudphdr {
 	struct udphdr		vxlh_udp;
 	struct vxlan_header	vxlh_hdr;
 } __packed;
 
 static int	vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *);
 static void	vxlan_ftable_init(struct vxlan_softc *);
 static void	vxlan_ftable_fini(struct vxlan_softc *);
 static void	vxlan_ftable_flush(struct vxlan_softc *, int);
 static void	vxlan_ftable_expire(struct vxlan_softc *);
 static int	vxlan_ftable_update_locked(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, const uint8_t *,
 		    struct rm_priotracker *);
 static int	vxlan_ftable_learn(struct vxlan_softc *,
 		    const struct sockaddr *, const uint8_t *);
 static int	vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS);
 
 static struct vxlan_ftable_entry *
 		vxlan_ftable_entry_alloc(void);
 static void	vxlan_ftable_entry_free(struct vxlan_ftable_entry *);
 static void	vxlan_ftable_entry_init(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *, const uint8_t *,
 		    const struct sockaddr *, uint32_t);
 static void	vxlan_ftable_entry_destroy(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *);
 static int	vxlan_ftable_entry_insert(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *);
 static struct vxlan_ftable_entry *
 		vxlan_ftable_entry_lookup(struct vxlan_softc *,
 		    const uint8_t *);
 static void	vxlan_ftable_entry_dump(struct vxlan_ftable_entry *,
 		    struct sbuf *);
 
 static struct vxlan_socket *
 		vxlan_socket_alloc(const union vxlan_sockaddr *);
 static void	vxlan_socket_destroy(struct vxlan_socket *);
 static void	vxlan_socket_release(struct vxlan_socket *);
 static struct vxlan_socket *
 		vxlan_socket_lookup(union vxlan_sockaddr *vxlsa);
 static void	vxlan_socket_insert(struct vxlan_socket *);
 static int	vxlan_socket_init(struct vxlan_socket *, struct ifnet *);
 static int	vxlan_socket_bind(struct vxlan_socket *, struct ifnet *);
 static int	vxlan_socket_create(struct ifnet *, int,
 		    const union vxlan_sockaddr *, struct vxlan_socket **);
 static void	vxlan_socket_ifdetach(struct vxlan_socket *,
 		    struct ifnet *, struct vxlan_softc_head *);
 
 static struct vxlan_socket *
 		vxlan_socket_mc_lookup(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_mc_info_match(
 		    const struct vxlan_socket_mc_info *,
 		    const union vxlan_sockaddr *,
 		    const union vxlan_sockaddr *, int);
 static int	vxlan_socket_mc_join_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
 		    int *, union vxlan_sockaddr *);
 static int	vxlan_socket_mc_leave_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *,
 		    const union vxlan_sockaddr *, int);
 static int	vxlan_socket_mc_add_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
 		    int, int *);
 static void	vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *,
 		    int);
 
 static struct vxlan_softc *
 		vxlan_socket_lookup_softc_locked(struct vxlan_socket *,
 		    uint32_t);
 static struct vxlan_softc *
 		vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t);
 static int	vxlan_socket_insert_softc(struct vxlan_socket *,
 		    struct vxlan_softc *);
 static void	vxlan_socket_remove_softc(struct vxlan_socket *,
 		    struct vxlan_softc *);
 
 static struct ifnet *
 		vxlan_multicast_if_ref(struct vxlan_softc *, int);
 static void	vxlan_free_multicast(struct vxlan_softc *);
 static int	vxlan_setup_multicast_interface(struct vxlan_softc *);
 
 static int	vxlan_setup_multicast(struct vxlan_softc *);
 static int	vxlan_setup_socket(struct vxlan_softc *);
 #ifdef INET6
 static void	vxlan_setup_zero_checksum_port(struct vxlan_softc *);
 #endif
 static void	vxlan_setup_interface_hdrlen(struct vxlan_softc *);
 static int	vxlan_valid_init_config(struct vxlan_softc *);
 static void	vxlan_init_wait(struct vxlan_softc *);
 static void	vxlan_init_complete(struct vxlan_softc *);
 static void	vxlan_init(void *);
 static void	vxlan_release(struct vxlan_softc *);
 static void	vxlan_teardown_wait(struct vxlan_softc *);
 static void	vxlan_teardown_complete(struct vxlan_softc *);
 static void	vxlan_teardown_locked(struct vxlan_softc *);
 static void	vxlan_teardown(struct vxlan_softc *);
 static void	vxlan_ifdetach(struct vxlan_softc *, struct ifnet *,
 		    struct vxlan_softc_head *);
 static void	vxlan_timer(void *);
 
 /* Command handlers; each one is referenced from vxlan_control_table[]. */
 static int	vxlan_ctrl_get_config(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_vni(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_local_port(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_port_range(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_multicast_if(struct vxlan_softc * , void *);
 static int	vxlan_ctrl_set_ttl(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_learn(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_flush(struct vxlan_softc *, void *);
 /* ioctl entry points. */
 static int	vxlan_ioctl_drvspec(struct vxlan_softc *,
 		    struct ifdrv *, int);
 static int	vxlan_ioctl_ifflags(struct vxlan_softc *);
 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
 
 /* Encapsulation helpers shared by the IPv4 and IPv6 paths. */
 #if defined(INET) || defined(INET6)
 static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *);
 static void	vxlan_encap_header(struct vxlan_softc *, struct mbuf *,
 		    int, uint16_t, uint16_t);
 #endif
 /* Transmit and receive paths. */
 static int	vxlan_encap4(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, struct mbuf *);
 static int	vxlan_encap6(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, struct mbuf *);
 static int	vxlan_transmit(struct ifnet *, struct mbuf *);
 static void	vxlan_qflush(struct ifnet *);
 static bool	vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *,
 		    const struct sockaddr *, void *);
 static int	vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
 		    const struct sockaddr *);
 
 /* Statistics, configuration, capabilities and interface cloning. */
 static int	vxlan_stats_alloc(struct vxlan_softc *);
 static void	vxlan_stats_free(struct vxlan_softc *);
 static void	vxlan_set_default_config(struct vxlan_softc *);
 static int	vxlan_set_user_config(struct vxlan_softc *,
 		     struct ifvxlanparam *);
 static int	vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
 static void	vxlan_set_hwcaps(struct vxlan_softc *);
 static int	vxlan_clone_create(struct if_clone *, char *, size_t,
 		    struct ifc_data *, struct ifnet **);
 static int	vxlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 
 /* MAC hashing and ifmedia callbacks. */
 static uint32_t vxlan_mac_hash(struct vxlan_softc *, const uint8_t *);
 static int	vxlan_media_change(struct ifnet *);
 static void	vxlan_media_status(struct ifnet *, struct ifmediareq *);
 
 /* Helpers operating on union vxlan_sockaddr. */
 static int	vxlan_sockaddr_cmp(const union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static void	vxlan_sockaddr_copy(union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static int	vxlan_sockaddr_in_equal(const union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static void	vxlan_sockaddr_in_copy(union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static int	vxlan_sockaddr_supported(const union vxlan_sockaddr *, int);
 static int	vxlan_sockaddr_in_any(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *);
 
 /* Configuration validation. */
 static int	vxlan_can_change_config(struct vxlan_softc *);
 static int	vxlan_check_vni(uint32_t);
 static int	vxlan_check_ttl(int);
 static int	vxlan_check_ftable_timeout(uint32_t);
 static int	vxlan_check_ftable_max(uint32_t);
 
 /* Per-interface sysctl and tunable handling. */
 static void	vxlan_sysctl_setup(struct vxlan_softc *);
 static void	vxlan_sysctl_destroy(struct vxlan_softc *);
 static int	vxlan_tunable_int(struct vxlan_softc *, const char *, int);
 
 /* Interface-detach event handling and module load/unload. */
 static void	vxlan_ifdetach_event(void *, struct ifnet *);
 static void	vxlan_load(void);
 static void	vxlan_unload(void);
 static int	vxlan_modevent(module_t, int, void *);
 
 /* Driver name; also passed to MALLOC_DEFINE() for the M_VXLAN malloc type. */
 static const char vxlan_name[] = "vxlan";
 static MALLOC_DEFINE(M_VXLAN, vxlan_name,
     "Virtual eXtensible LAN Interface");
 static struct if_clone *vxlan_cloner;
 
 /*
  * Mutex taken by the socket release/lookup/insert helpers around their
  * accesses to vxlan_socket_list.
  */
 static struct mtx vxlan_list_mtx;
 #define VXLAN_LIST_LOCK()	mtx_lock(&vxlan_list_mtx)
 #define VXLAN_LIST_UNLOCK()	mtx_unlock(&vxlan_list_mtx)
 
 /* All vxlan sockets inserted by vxlan_socket_insert(). */
 static LIST_HEAD(, vxlan_socket) vxlan_socket_list;
 
 static eventhandler_tag vxlan_ifdetach_event_tag;
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Virtual eXtensible Local Area Network");
 
 /*
  * Loader tunables.  A non-zero vxlan_reuse_port makes vxlan_socket_init()
  * request a port-reuse socket option on every socket it creates.
  * NOTE(review): the consumer of vxlan_legacy_port is not visible in this
  * part of the file.
  */
 static int vxlan_legacy_port = 0;
 TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port);
 static int vxlan_reuse_port = 0;
 TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port);
 
 /* Default maximum number of addresses in the forwarding table. */
 #ifndef VXLAN_FTABLE_MAX
 #define VXLAN_FTABLE_MAX	2000
 #endif
 
 /* Timeout (in seconds) of addresses learned in the forwarding table. */
 #ifndef VXLAN_FTABLE_TIMEOUT
 #define VXLAN_FTABLE_TIMEOUT	(20 * 60)
 #endif
 
 /*
  * Maximum timeout (in seconds) of addresses learned in the forwarding
  * table.
  */
 #ifndef VXLAN_FTABLE_MAX_TIMEOUT
 #define VXLAN_FTABLE_MAX_TIMEOUT	(60 * 60 * 24)
 #endif
 
 /* Number of seconds between pruning attempts of the forwarding table. */
 #ifndef VXLAN_FTABLE_PRUNE
 #define VXLAN_FTABLE_PRUNE	(5 * 60)
 #endif
 
 /*
  * Prune period in seconds; vxlan_init() multiplies it by hz when arming
  * the per-interface callout.
  */
 static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE;
 
 /*
  * Descriptor for one driver-specific command: the handler to invoke, the
  * size of the argument structure it takes, and VXLAN_CTRL_FLAG_* bits.
  * vxlan_control_table[] holds one descriptor per VXLAN_CMD_* value.
  */
 struct vxlan_control {
 	/* Handler, called with the softc and the command argument. */
 	int	(*vxlc_func)(struct vxlan_softc *, void *);
 	/* sizeof() of the argument structure. */
 	int	vxlc_argsize;
 	/*
 	 * NOTE(review): the flag names suggest copyin/copyout of the
 	 * argument and a privilege check; the code consuming them is not
 	 * visible in this part of the file -- confirm there.
 	 */
 	int	vxlc_flags;
 #define VXLAN_CTRL_FLAG_COPYIN	0x01
 #define VXLAN_CTRL_FLAG_COPYOUT	0x02
 #define VXLAN_CTRL_FLAG_SUSER	0x04
 };
 
 /*
  * Command dispatch table, indexed by VXLAN_CMD_*.  Only GET_CONFIG copies
  * data out; every other command copies its struct ifvxlancmd argument in
  * and carries the SUSER flag.
  */
 static const struct vxlan_control vxlan_control_table[] = {
 	[VXLAN_CMD_GET_CONFIG] = {
 		.vxlc_func = vxlan_ctrl_get_config,
 		.vxlc_argsize = sizeof(struct ifvxlancfg),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYOUT,
 	},
 	[VXLAN_CMD_SET_VNI] = {
 		.vxlc_func = vxlan_ctrl_set_vni,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_LOCAL_ADDR] = {
 		.vxlc_func = vxlan_ctrl_set_local_addr,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_REMOTE_ADDR] = {
 		.vxlc_func = vxlan_ctrl_set_remote_addr,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_LOCAL_PORT] = {
 		.vxlc_func = vxlan_ctrl_set_local_port,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_REMOTE_PORT] = {
 		.vxlc_func = vxlan_ctrl_set_remote_port,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_PORT_RANGE] = {
 		.vxlc_func = vxlan_ctrl_set_port_range,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_FTABLE_TIMEOUT] = {
 		.vxlc_func = vxlan_ctrl_set_ftable_timeout,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_FTABLE_MAX] = {
 		.vxlc_func = vxlan_ctrl_set_ftable_max,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_MULTICAST_IF] = {
 		.vxlc_func = vxlan_ctrl_set_multicast_if,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_TTL] = {
 		.vxlc_func = vxlan_ctrl_set_ttl,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_SET_LEARN] = {
 		.vxlc_func = vxlan_ctrl_set_learn,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_FTABLE_ENTRY_ADD] = {
 		.vxlc_func = vxlan_ctrl_ftable_entry_add,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_FTABLE_ENTRY_REM] = {
 		.vxlc_func = vxlan_ctrl_ftable_entry_rem,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 	[VXLAN_CMD_FLUSH] = {
 		.vxlc_func = vxlan_ctrl_flush,
 		.vxlc_argsize = sizeof(struct ifvxlancmd),
 		.vxlc_flags = VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	},
 };
 
 /* Number of slots in vxlan_control_table[]. */
 static const int vxlan_control_table_size = nitems(vxlan_control_table);
 
 /*
  * Compare two Ethernet addresses byte by byte.  Returns 0 when they are
  * equal, otherwise the signed difference (a[i] - b[i]) of the first pair
  * of bytes that differ.
  */
 static int
 vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b)
 {
 	int idx, delta;
 
 	for (idx = 0; idx < ETHER_ADDR_LEN; idx++) {
 		delta = (int)a[idx] - (int)b[idx];
 		if (delta != 0)
 			return (delta);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate the forwarding table (VXLAN_SC_FTABLE_SIZE hash buckets),
  * initialize every bucket as an empty list and pick a random hash key.
  * The M_WAITOK allocation result is used without a NULL check.
  */
 static void
 vxlan_ftable_init(struct vxlan_softc *sc)
 {
 	int bucket;
 
 	sc->vxl_ftable = malloc(VXLAN_SC_FTABLE_SIZE *
 	    sizeof(struct vxlan_ftable_head), M_VXLAN, M_WAITOK | M_ZERO);
 
 	bucket = 0;
 	while (bucket < VXLAN_SC_FTABLE_SIZE) {
 		LIST_INIT(&sc->vxl_ftable[bucket]);
 		bucket++;
 	}
 
 	sc->vxl_ftable_hash_key = arc4random();
 }
 
 /*
  * Release the forwarding table.  The table is expected to be empty
  * already: every bucket and the entry count are asserted before the
  * bucket array is freed and the pointer cleared.
  */
 static void
 vxlan_ftable_fini(struct vxlan_softc *sc)
 {
 	int bucket;
 
 	bucket = 0;
 	while (bucket < VXLAN_SC_FTABLE_SIZE) {
 		KASSERT(LIST_EMPTY(&sc->vxl_ftable[bucket]),
 		    ("%s: vxlan %p ftable[%d] not empty", __func__, sc,
 		    bucket));
 		bucket++;
 	}
 	MPASS(sc->vxl_ftable_cnt == 0);
 
 	free(sc->vxl_ftable, M_VXLAN);
 	sc->vxl_ftable = NULL;
 }
 
 /*
  * Destroy forwarding entries.  With `all' non-zero every entry goes;
  * otherwise only the dynamic ones are removed.
  */
 static void
 vxlan_ftable_flush(struct vxlan_softc *sc, int all)
 {
 	struct vxlan_ftable_entry *cur, *next;
 	int bucket;
 
 	for (bucket = 0; bucket < VXLAN_SC_FTABLE_SIZE; bucket++) {
 		LIST_FOREACH_SAFE(cur, &sc->vxl_ftable[bucket], vxlfe_hash,
 		    next) {
 			/* Entries that are not dynamic survive a partial flush. */
 			if (!all && !VXLAN_FE_IS_DYNAMIC(cur))
 				continue;
 			vxlan_ftable_entry_destroy(sc, cur);
 		}
 	}
 }
 
 /*
  * Destroy every dynamic forwarding entry whose expiry time has been
  * reached.  The write lock must be held (asserted).
  */
 static void
 vxlan_ftable_expire(struct vxlan_softc *sc)
 {
 	struct vxlan_ftable_entry *cur, *next;
 	int bucket;
 
 	VXLAN_LOCK_WASSERT(sc);
 
 	for (bucket = 0; bucket < VXLAN_SC_FTABLE_SIZE; bucket++) {
 		LIST_FOREACH_SAFE(cur, &sc->vxl_ftable[bucket], vxlfe_hash,
 		    next) {
 			if (!VXLAN_FE_IS_DYNAMIC(cur))
 				continue;
 			if (time_uptime < cur->vxlfe_expire)
 				continue;
 			vxlan_ftable_entry_destroy(sc, cur);
 		}
 	}
 }
 
 /*
  * Refresh or create the dynamic forwarding entry mapping `mac' to the
  * remote address `vxlsa'.
  *
  * The caller must hold the softc lock (asserted); `tracker' is the
  * read-lock tracker, used when the read lock has to be dropped.  Whenever
  * an entry must be modified or created and the write lock is not owned,
  * the read lock is released, the write lock is taken and the lookup is
  * restarted.  The function can therefore return with the write lock held
  * instead of the read lock.
  *
  * Returns 0 on success (including when an existing entry needs no
  * change), ENOSPC when the table already holds vxl_ftable_max entries, or
  * ENOMEM when a new entry cannot be allocated.
  */
 static int
 vxlan_ftable_update_locked(struct vxlan_softc *sc,
     const union vxlan_sockaddr *vxlsa, const uint8_t *mac,
     struct rm_priotracker *tracker)
 {
 	struct vxlan_ftable_entry *fe;
 	int error __unused;
 
 	VXLAN_LOCK_ASSERT(sc);
 
 again:
 	/*
 	 * A forwarding entry for this MAC address might already exist. If
 	 * so, update it, otherwise create a new one. We may have to upgrade
 	 * the lock if we have to change or create an entry.
 	 */
 	fe = vxlan_ftable_entry_lookup(sc, mac);
 	if (fe != NULL) {
 		/* The expiry is refreshed regardless of which lock is held. */
 		fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
 
 		/* Non-dynamic entries and unchanged addresses need no update. */
 		if (!VXLAN_FE_IS_DYNAMIC(fe) ||
 		    vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, &vxlsa->sa))
 			return (0);
 		if (!VXLAN_LOCK_WOWNED(sc)) {
 			/*
 			 * Drop the read lock, take the write lock and redo
 			 * the lookup; the counter is bumped on each such
 			 * switch.
 			 */
 			VXLAN_RUNLOCK(sc, tracker);
 			VXLAN_WLOCK(sc);
 			sc->vxl_stats.ftable_lock_upgrade_failed++;
 			goto again;
 		}
 		vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, &vxlsa->sa);
 		return (0);
 	}
 
 	/* No entry yet: creating one requires the write lock. */
 	if (!VXLAN_LOCK_WOWNED(sc)) {
 		VXLAN_RUNLOCK(sc, tracker);
 		VXLAN_WLOCK(sc);
 		sc->vxl_stats.ftable_lock_upgrade_failed++;
 		goto again;
 	}
 
 	if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) {
 		sc->vxl_stats.ftable_nospace++;
 		return (ENOSPC);
 	}
 
 	fe = vxlan_ftable_entry_alloc();
 	if (fe == NULL)
 		return (ENOMEM);
 
 	vxlan_ftable_entry_init(sc, fe, mac, &vxlsa->sa, VXLAN_FE_FLAG_DYNAMIC);
 
 	/* The prior lookup failed, so the insert should not. */
 	error = vxlan_ftable_entry_insert(sc, fe);
 	MPASS(error == 0);
 
 	return (0);
 }
 
 static int
 vxlan_ftable_learn(struct vxlan_softc *sc, const struct sockaddr *sa,
     const uint8_t *mac)
 {
 	struct rm_priotracker tracker;
 	union vxlan_sockaddr vxlsa;
 	int error;
 
 	/*
 	 * The source port may be randomly selected by the remote host, so
 	 * use the port of the default destination address.
 	 */
 	vxlan_sockaddr_copy(&vxlsa, sa);
 	vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
 
 	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_RLOCK(sc, &tracker);
 	error = vxlan_ftable_update_locked(sc, &vxlsa, mac, &tracker);
 	VXLAN_UNLOCK(sc, &tracker);
 
 	return (error);
 }
 
 static int
 vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sbuf sb;
 	struct vxlan_softc *sc;
 	struct vxlan_ftable_entry *fe;
 	size_t size;
 	int i, error;
 
 	/*
 	 * This is mostly intended for debugging during development. It is
 	 * not practical to dump an entire large table this way.
 	 */
 
 	sc = arg1;
 	size = PAGE_SIZE;	/* Calculate later. */
 
 	sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN);
 	sbuf_putc(&sb, '\n');
 
 	VXLAN_RLOCK(sc, &tracker);
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
 		LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) {
 			if (sbuf_error(&sb) != 0)
 				break;
 			vxlan_ftable_entry_dump(fe, &sb);
 		}
 	}
 	VXLAN_RUNLOCK(sc, &tracker);
 
 	if (sbuf_len(&sb) == 1)
 		sbuf_setpos(&sb, 0);
 
 	sbuf_finish(&sb);
 	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 
 	return (error);
 }
 
 /*
  * Allocate a zeroed forwarding entry with M_NOWAIT; the result of
  * malloc() is returned as-is, so callers must check it for NULL.
  */
 static struct vxlan_ftable_entry *
 vxlan_ftable_entry_alloc(void)
 {
 
 	return (malloc(sizeof(struct vxlan_ftable_entry), M_VXLAN,
 	    M_NOWAIT | M_ZERO));
 }
 
 /* Return a forwarding entry to the M_VXLAN malloc type. */
 static void
 vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe)
 {
 
 	free(fe, M_VXLAN);
 }
 
 /*
  * Fill in a forwarding entry: MAC address, remote address, flags, and an
  * expiry of "now + vxl_ftable_timeout".
  */
 static void
 vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe,
     const uint8_t *mac, const struct sockaddr *sa, uint32_t flags)
 {
 
 	memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN);
 	vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa);
 	fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
 	fe->vxlfe_flags = flags;
 }
 
 /*
  * Unlink a forwarding entry from its hash bucket, account for it in the
  * table's entry count and free it.
  */
 static void
 vxlan_ftable_entry_destroy(struct vxlan_softc *sc,
     struct vxlan_ftable_entry *fe)
 {
 
 	LIST_REMOVE(fe, vxlfe_hash);
 	sc->vxl_ftable_cnt--;
 	vxlan_ftable_entry_free(fe);
 }
 
 static int
 vxlan_ftable_entry_insert(struct vxlan_softc *sc,
     struct vxlan_ftable_entry *fe)
 {
 	struct vxlan_ftable_entry *lfe;
 	uint32_t hash;
 	int dir;
 
 	VXLAN_LOCK_WASSERT(sc);
 	hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac);
 
 	lfe = LIST_FIRST(&sc->vxl_ftable[hash]);
 	if (lfe == NULL) {
 		LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash);
 		goto out;
 	}
 
 	do {
 		dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac);
 		if (dir == 0)
 			return (EEXIST);
 		if (dir > 0) {
 			LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash);
 			goto out;
 		} else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) {
 			LIST_INSERT_AFTER(lfe, fe, vxlfe_hash);
 			goto out;
 		} else
 			lfe = LIST_NEXT(lfe, vxlfe_hash);
 	} while (lfe != NULL);
 
 out:
 	sc->vxl_ftable_cnt++;
 
 	return (0);
 }
 
 static struct vxlan_ftable_entry *
 vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac)
 {
 	struct vxlan_ftable_entry *fe;
 	uint32_t hash;
 	int dir;
 
 	VXLAN_LOCK_ASSERT(sc);
 	hash = VXLAN_SC_FTABLE_HASH(sc, mac);
 
 	LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) {
 		dir = vxlan_ftable_addr_cmp(mac, fe->vxlfe_mac);
 		if (dir == 0)
 			return (fe);
 		if (dir > 0)
 			break;
 	}
 
 	return (NULL);
 }
 
 /*
  * Append one line describing `fe' to `sb': a 'D'/'S' marker, the flags,
  * the MAC address, the remote address and the expiry time.  If the sbuf
  * reports an error afterwards, it is rewound to where this line began.
  */
 static void
 vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb)
 {
 	char addrstr[64];
 	const union vxlan_sockaddr *raddr;
 	const void *rawaddr;
 	int byte, mark, family, pad;
 
 	raddr = &fe->vxlfe_raddr;
 	family = raddr->sa.sa_family;
 	mark = sbuf_len(sb);
 
 	sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S',
 	    fe->vxlfe_flags);
 
 	/* All but the last byte are followed by ':', the last by a space. */
 	byte = 0;
 	while (byte < ETHER_ADDR_LEN - 1) {
 		sbuf_printf(sb, "%02X:", fe->vxlfe_mac[byte]);
 		byte++;
 	}
 	sbuf_printf(sb, "%02X ", fe->vxlfe_mac[byte]);
 
 	/* Anything that is not AF_INET is formatted as an IPv6 address. */
 	if (family != AF_INET) {
 		rawaddr = &raddr->in6.sin6_addr;
 		pad = INET6_ADDRSTRLEN - 1;
 	} else {
 		rawaddr = &raddr->in4.sin_addr;
 		pad = INET_ADDRSTRLEN - 1;
 	}
 	inet_ntop(family, rawaddr, addrstr, sizeof(addrstr));
 	sbuf_printf(sb, "%*s ", pad, addrstr);
 
 	sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire);
 
 	sbuf_putc(sb, '\n');
 
 	/* Drop a partially written line. */
 	if (sbuf_error(sb) != 0)
 		sbuf_setpos(sb, mark);
 }
 
 /*
  * Allocate and initialize a vxlan socket descriptor for local address
  * `sa': lock, reference count (starting at zero) and empty VNI hash
  * buckets.  No kernel socket is created here.
  */
 static struct vxlan_socket *
 vxlan_socket_alloc(const union vxlan_sockaddr *sa)
 {
 	struct vxlan_socket *vso;
 	int bucket;
 
 	vso = malloc(sizeof(struct vxlan_socket), M_VXLAN, M_ZERO | M_WAITOK);
 	vso->vxlso_laddr = *sa;
 	rm_init(&vso->vxlso_lock, "vxlansorm");
 	refcount_init(&vso->vxlso_refcnt, 0);
 
 	bucket = 0;
 	while (bucket < VXLAN_SO_VNI_HASH_SIZE) {
 		LIST_INIT(&vso->vxlso_vni_hash[bucket]);
 		bucket++;
 	}
 
 	return (vso);
 }
 
 /*
  * Tear down a vxlan socket descriptor: close the kernel socket if there
  * is one, destroy the lock and free the descriptor.  Under INVARIANTS it
  * first asserts that no multicast slot and no VNI bucket is still in use.
  */
 static void
 vxlan_socket_destroy(struct vxlan_socket *vso)
 {
 	struct socket *so;
 #ifdef INVARIANTS
 	int n;
 
 	for (n = 0; n < VXLAN_SO_MC_MAX_GROUPS; n++) {
 		KASSERT(vso->vxlso_mc[n].vxlsomc_gaddr.sa.sa_family ==
 		    AF_UNSPEC,
 		    ("%s: socket %p mc[%d] still has address",
 		     __func__, vso, n));
 	}
 
 	for (n = 0; n < VXLAN_SO_VNI_HASH_SIZE; n++) {
 		KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[n]),
 		    ("%s: socket %p vni_hash[%d] not empty",
 		     __func__, vso, n));
 	}
 #endif
 	/* Detach the socket pointer before closing the socket. */
 	so = vso->vxlso_sock;
 	vso->vxlso_sock = NULL;
 	if (so != NULL)
 		soclose(so);
 
 	rm_destroy(&vso->vxlso_lock);
 	free(vso, M_VXLAN);
 }
 
 /*
  * Drop one reference on `vso'.  When VXLAN_SO_RELEASE() reports that the
  * socket should go away, it is unlinked from the global list under the
  * list lock and destroyed after the lock has been dropped.
  */
 static void
 vxlan_socket_release(struct vxlan_socket *vso)
 {
 	int last;
 
 	VXLAN_LIST_LOCK();
 	last = VXLAN_SO_RELEASE(vso);
 	if (last != 0)
 		LIST_REMOVE(vso, vxlso_entry);
 	VXLAN_LIST_UNLOCK();
 
 	if (last == 0)
 		return;
 
 	vxlan_socket_destroy(vso);
 }
 
 static struct vxlan_socket *
 vxlan_socket_lookup(union vxlan_sockaddr *vxlsa)
 {
 	struct vxlan_socket *vso;
 
 	VXLAN_LIST_LOCK();
 	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) {
 		if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) {
 			VXLAN_SO_ACQUIRE(vso);
 			break;
 		}
 	}
 	VXLAN_LIST_UNLOCK();
 
 	return (vso);
 }
 
 /*
  * Publish `vso' on the global socket list.  A reference is acquired on
  * it, under the list lock, before it is linked in.
  */
 static void
 vxlan_socket_insert(struct vxlan_socket *vso)
 {
 
 	VXLAN_LIST_LOCK();
 	VXLAN_SO_ACQUIRE(vso);
 	LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry);
 	VXLAN_LIST_UNLOCK();
 }
 
 static int
 vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp)
 {
 	struct thread *td;
 	int error;
 
 	td = curthread;
 
 	error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock,
 	    SOCK_DGRAM, IPPROTO_UDP, td->td_ucred, td);
 	if (error) {
 		if_printf(ifp, "cannot create socket: %d\n", error);
 		return (error);
 	}
 
 	error = udp_set_kernel_tunneling(vso->vxlso_sock,
 	    vxlan_rcv_udp_packet, NULL, vso);
 	if (error) {
 		if_printf(ifp, "cannot set tunneling function: %d\n", error);
 		return (error);
 	}
 
 	if (vxlan_reuse_port != 0) {
 		struct sockopt sopt;
 		int val = 1;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = SO_REUSEPORT;
 		sopt.sopt_val = &val;
 		sopt.sopt_valsize = sizeof(val);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error) {
 			if_printf(ifp,
 			    "cannot set REUSEADDR socket opt: %d\n", error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 static int
 vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp)
 {
 	union vxlan_sockaddr laddr;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	laddr = vso->vxlso_laddr;
 
 	error = sobind(vso->vxlso_sock, &laddr.sa, td);
 	if (error) {
 		if (error != EADDRINUSE)
 			if_printf(ifp, "cannot bind socket: %d\n", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 vxlan_socket_create(struct ifnet *ifp, int multicast,
     const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop)
 {
 	union vxlan_sockaddr laddr;
 	struct vxlan_socket *vso;
 	int error;
 
 	laddr = *saddr;
 
 	/*
 	 * If this socket will be multicast, then only the local port
 	 * must be specified when binding.
 	 */
 	if (multicast != 0) {
 		if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
 			laddr.in4.sin_addr.s_addr = INADDR_ANY;
 #ifdef INET6
 		else
 			laddr.in6.sin6_addr = in6addr_any;
 #endif
 	}
 
 	vso = vxlan_socket_alloc(&laddr);
 	if (vso == NULL)
 		return (ENOMEM);
 
 	error = vxlan_socket_init(vso, ifp);
 	if (error)
 		goto fail;
 
 	error = vxlan_socket_bind(vso, ifp);
 	if (error)
 		goto fail;
 
 	/*
 	 * There is a small window between the bind completing and
 	 * inserting the socket, so that a concurrent create may fail.
 	 * Let's not worry about that for now.
 	 */
 	vxlan_socket_insert(vso);
 	*vsop = vso;
 
 	return (0);
 
 fail:
 	vxlan_socket_destroy(vso);
 
 	return (error);
 }
 
 static void
 vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp,
     struct vxlan_softc_head *list)
 {
 	struct rm_priotracker tracker;
 	struct vxlan_softc *sc;
 	int i;
 
 	VXLAN_SO_RLOCK(vso, &tracker);
 	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
 		LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry)
 			vxlan_ifdetach(sc, ifp, list);
 	}
 	VXLAN_SO_RUNLOCK(vso, &tracker);
 }
 
 static struct vxlan_socket *
 vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa)
 {
 	union vxlan_sockaddr laddr;
 	struct vxlan_socket *vso;
 
 	laddr = *vxlsa;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
 		laddr.in4.sin_addr.s_addr = INADDR_ANY;
 #ifdef INET6
 	else
 		laddr.in6.sin6_addr = in6addr_any;
 #endif
 
 	vso = vxlan_socket_lookup(&laddr);
 
 	return (vso);
 }
 
 /*
  * Tell whether multicast slot `mc' already covers the given request.
  * The slot matches (1) when the local address is a wildcard or equals the
  * slot's source address, the group addresses are equal, and `ifidx' is
  * either 0 or the slot's interface index.  Otherwise 0 is returned.
  */
 static int
 vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int ifidx)
 {
 	int local_ok;
 
 	local_ok = vxlan_sockaddr_in_any(local) ||
 	    vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa);
 	if (!local_ok)
 		return (0);
 
 	if (vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa) == 0)
 		return (0);
 
 	return (ifidx == 0 || ifidx == mc->vxlsomc_ifidx);
 }
 
 static int
 vxlan_socket_mc_join_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int *ifidx, union vxlan_sockaddr *source)
 {
 	struct sockopt sopt;
 	int error;
 
 	*source = *local;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
 		struct ip_mreq mreq;
 
 		mreq.imr_multiaddr = group->in4.sin_addr;
 		mreq.imr_interface = local->in4.sin_addr;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = IP_ADD_MEMBERSHIP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error)
 			return (error);
 
 		/*
 		 * BMV: Ideally, there would be a formal way for us to get
 		 * the local interface that was selected based on the
 		 * imr_interface address. We could then update *ifidx so
 		 * vxlan_sockaddr_mc_info_match() would return a match for
 		 * later creates that explicitly set the multicast interface.
 		 *
 		 * If we really need to, we can of course look in the INP's
 		 * membership list:
 		 *     sotoinpcb(vso->vxlso_sock)->inp_moptions->
 		 *         imo_head[]->imf_inm->inm_ifp
 		 * similarly to imo_match_group().
 		 */
 		source->in4.sin_addr = local->in4.sin_addr;
 
 	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
 		struct ipv6_mreq mreq;
 
 		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
 		mreq.ipv6mr_interface = *ifidx;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IPV6;
 		sopt.sopt_name = IPV6_JOIN_GROUP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error)
 			return (error);
 
 		/*
 		 * BMV: As with IPv4, we would really like to know what
 		 * interface in6p_lookup_mcast_ifp() selected.
 		 */
 	} else
 		error = EAFNOSUPPORT;
 
 	return (error);
 }
 
 static int
 vxlan_socket_mc_leave_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *source,
     int ifidx)
 {
 	struct sockopt sopt;
 	int error;
 
 	bzero(&sopt, sizeof(sopt));
 	sopt.sopt_dir = SOPT_SET;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
 		struct ip_mreq mreq;
 
 		mreq.imr_multiaddr = group->in4.sin_addr;
 		mreq.imr_interface = source->in4.sin_addr;
 
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = IP_DROP_MEMBERSHIP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 
 	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
 		struct ipv6_mreq mreq;
 
 		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
 		mreq.ipv6mr_interface = ifidx;
 
 		sopt.sopt_level = IPPROTO_IPV6;
 		sopt.sopt_name = IPV6_LEAVE_GROUP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 
 	} else
 		error = EAFNOSUPPORT;
 
 	return (error);
 }
 
 /*
  * Register one more user of multicast group `group' on socket `vso',
  * joining the group on the kernel socket if no matching slot exists yet.
  *
  * On success 0 is returned and *idx holds the index of the slot in
  * vso->vxlso_mc[] whose user count was incremented.  Returns ENOSPC when
  * no slot is free, or the error from joining the group.
  */
 static int
 vxlan_socket_mc_add_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int ifidx, int *idx)
 {
 	union vxlan_sockaddr source;
 	struct vxlan_socket_mc_info *mc;
 	int i, empty, error;
 
 	/*
 	 * Within a socket, the same multicast group may be used by multiple
 	 * interfaces, each with a different network identifier. But a socket
 	 * may only join a multicast group once, so keep track of the users
 	 * here.
 	 */
 
 	/* Pass 1: look for a slot already describing this membership. */
 	VXLAN_SO_WLOCK(vso);
 	for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
 		mc = &vso->vxlso_mc[i];
 
 		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
 			empty++;
 			continue;
 		}
 
 		/* Jumps to `out' with the write lock still held. */
 		if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx))
 			goto out;
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	if (empty == 0)
 		return (ENOSPC);
 
 	/* The join is performed without the socket lock held. */
 	error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source);
 	if (error)
 		return (error);
 
 	/*
 	 * Pass 2: claim the first free slot.  The lock was dropped above, so
 	 * the free slots counted in pass 1 may be gone by now.
 	 */
 	VXLAN_SO_WLOCK(vso);
 	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
 		mc = &vso->vxlso_mc[i];
 
 		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
 			vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa);
 			vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa);
 			mc->vxlsomc_ifidx = ifidx;
 			goto out;
 		}
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	/* No slot left after all: undo the join performed above. */
 	error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx);
 	MPASS(error == 0);
 
 	return (ENOSPC);
 
 out:
 	/* Reached with the write lock held and `mc'/`i' naming the slot. */
 	mc->vxlsomc_users++;
 	VXLAN_SO_WUNLOCK(vso);
 
 	*idx = i;
 
 	return (0);
 }
 
 /*
  * Drop one user of multicast slot `idx' on `vso'.  When the last user is
  * gone the slot is cleared and, after the lock has been released, the
  * kernel socket leaves the group.
  */
 static void
 vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx)
 {
 	union vxlan_sockaddr group, source;
 	struct vxlan_socket_mc_info *slot;
 	int ifidx, last_user;
 
 	KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS,
 	    ("%s: vso %p idx %d out of bounds", __func__, vso, idx));
 
 	slot = &vso->vxlso_mc[idx];
 
 	VXLAN_SO_WLOCK(vso);
 	slot->vxlsomc_users--;
 	last_user = (slot->vxlsomc_users == 0);
 	if (last_user) {
 		/* Save what the leave needs before wiping the slot. */
 		group = slot->vxlsomc_gaddr;
 		source = slot->vxlsomc_saddr;
 		ifidx = slot->vxlsomc_ifidx;
 		bzero(slot, sizeof(*slot));
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	if (!last_user)
 		return;
 
 	/*
 	 * The membership may already be gone if the group was joined
 	 * through an interface that has since been detached, so the
 	 * result is deliberately ignored.
 	 */
 	vxlan_socket_mc_leave_group(vso, &group, &source, ifidx);
 }
 
 /*
  * Find the softc with network identifier `vni' on `vso'.  The socket lock
  * must be held (asserted).  On a match a reference is acquired on the
  * softc before it is returned; otherwise NULL is returned.
  */
 static struct vxlan_softc *
 vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni)
 {
 	struct vxlan_softc *sc;
 	uint32_t hash;
 
 	VXLAN_SO_LOCK_ASSERT(vso);
 	hash = VXLAN_SO_VNI_HASH(vni);
 
 	for (sc = LIST_FIRST(&vso->vxlso_vni_hash[hash]); sc != NULL;
 	    sc = LIST_NEXT(sc, vxl_entry)) {
 		if (sc->vxl_vni != vni)
 			continue;
 		VXLAN_ACQUIRE(sc);
 		return (sc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Read-locked wrapper around vxlan_socket_lookup_softc_locked(): returns
  * that function's result for `vni' on `vso' (a softc on which a reference
  * was acquired, or NULL).
  */
 static struct vxlan_softc *
 vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni)
 {
 	struct rm_priotracker tracker;
 	struct vxlan_softc *sc;
 
 	VXLAN_SO_RLOCK(vso, &tracker);
 	sc = vxlan_socket_lookup_softc_locked(vso, vni);
 	VXLAN_SO_RUNLOCK(vso, &tracker);
 
 	return (sc);
 }
 
 /*
  * Attach `sc' to `vso' under its network identifier.  Returns EEXIST when
  * a softc with the same VNI is already attached; otherwise a reference is
  * acquired on `sc', it is linked into the VNI hash and 0 is returned.
  */
 static int
 vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
 {
 	struct vxlan_softc *dup;
 	uint32_t vni, hash;
 
 	vni = sc->vxl_vni;
 	hash = VXLAN_SO_VNI_HASH(vni);
 
 	VXLAN_SO_WLOCK(vso);
 	dup = vxlan_socket_lookup_softc_locked(vso, vni);
 	if (dup == NULL) {
 		VXLAN_ACQUIRE(sc);
 		LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry);
 		VXLAN_SO_WUNLOCK(vso);
 		return (0);
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	/* Drop the reference the lookup acquired on the existing softc. */
 	vxlan_release(dup);
 
 	return (EEXIST);
 }
 
 /*
  * Detach `sc' from `vso': unlink it from the VNI hash under the socket
  * write lock, then call vxlan_release() on it once the lock is dropped.
  */
 static void
 vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
 {
 
 	VXLAN_SO_WLOCK(vso);
 	LIST_REMOVE(sc, vxl_entry);
 	VXLAN_SO_WUNLOCK(vso);
 
 	vxlan_release(sc);
 }
 
 /*
  * Return the multicast interface configured for the requested address
  * family (IPv4 when `ipv4' is non-zero, IPv6 otherwise) with a reference
  * taken on it, or NULL when no multicast options or interface are set.
  * The softc lock must be held (asserted).
  */
 static struct ifnet *
 vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4)
 {
 	struct ifnet *mcifp;
 
 	VXLAN_LOCK_ASSERT(sc);
 
 	mcifp = NULL;
 	if (ipv4) {
 		if (sc->vxl_im4o != NULL)
 			mcifp = sc->vxl_im4o->imo_multicast_ifp;
 	} else {
 		if (sc->vxl_im6o != NULL)
 			mcifp = sc->vxl_im6o->im6o_multicast_ifp;
 	}
 
 	if (mcifp == NULL)
 		return (NULL);
 
 	if_ref(mcifp);
 
 	return (mcifp);
 }
 
 /*
  * Undo vxlan_setup_multicast(): drop the reference on the multicast
  * interface (if any) and free both multicast option structures, leaving
  * all related softc fields cleared.
  */
 static void
 vxlan_free_multicast(struct vxlan_softc *sc)
 {
 
 	if (sc->vxl_mc_ifp != NULL) {
 		if_rele(sc->vxl_mc_ifp);
 		sc->vxl_mc_ifp = NULL;
 		sc->vxl_mc_ifindex = 0;
 	}
 
 	/* free(9) accepts NULL, so no guards are needed here. */
 	free(sc->vxl_im4o, M_VXLAN);
 	sc->vxl_im4o = NULL;
 
 	free(sc->vxl_im6o, M_VXLAN);
 	sc->vxl_im6o = NULL;
 }
 
 /*
  * Resolve sc->vxl_mc_ifname to an interface and remember it, together
  * with its index, in the softc.  The reference taken by ifunit_ref() is
  * kept on success.
  *
  * Returns 0, ENOENT when no such interface exists, or ENOTSUP when the
  * interface lacks IFF_MULTICAST.
  */
 static int
 vxlan_setup_multicast_interface(struct vxlan_softc *sc)
 {
 	struct ifnet *mcifp;
 
 	mcifp = ifunit_ref(sc->vxl_mc_ifname);
 	if (mcifp == NULL) {
 		if_printf(sc->vxl_ifp, "multicast interface %s does "
 		    "not exist\n", sc->vxl_mc_ifname);
 		return (ENOENT);
 	}
 
 	if (mcifp->if_flags & IFF_MULTICAST) {
 		sc->vxl_mc_ifp = mcifp;
 		sc->vxl_mc_ifindex = mcifp->if_index;
 		return (0);
 	}
 
 	if_printf(sc->vxl_ifp, "interface %s does not support "
 	     "multicast\n", sc->vxl_mc_ifname);
 	if_rele(mcifp);
 
 	return (ENOTSUP);
 }
 
 /*
  * Prepare the softc for sending to its multicast destination: resolve
  * the multicast interface when a name is configured, then allocate the
  * multicast options structure matching the destination address family.
  *
  * Returns 0, or the error from vxlan_setup_multicast_interface().
  */
 static int
 vxlan_setup_multicast(struct vxlan_softc *sc)
 {
 	const union vxlan_sockaddr *dst;
 	int rc;
 
 	dst = &sc->vxl_dst_addr;
 
 	if (sc->vxl_mc_ifname[0] != '\0') {
 		rc = vxlan_setup_multicast_interface(sc);
 		if (rc != 0)
 			return (rc);
 	}
 
 	/*
 	 * Build a multicast options structure populated just enough for
 	 * the respective IP output routine.  Such a structure normally
 	 * lives in the socket, but our sockets may be shared by several
 	 * interfaces.
 	 */
 	if (VXLAN_SOCKADDR_IS_IPV4(dst)) {
 		sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN,
 		    M_WAITOK | M_ZERO);
 		sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp;
 		sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
 		sc->vxl_im4o->imo_multicast_vif = -1;
 	} else if (VXLAN_SOCKADDR_IS_IPV6(dst)) {
 		sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN,
 		    M_WAITOK | M_ZERO);
 		sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp;
 		sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
 	}
 
 	return (0);
 }
 
 /*
  * Attach the softc to a vxlan socket for its source address.
  *
  * A new socket is created if possible; if that fails, an existing socket
  * is looked up instead.  For a multicast destination the multicast state
  * is set up and the group is added on the socket.  Finally the softc is
  * inserted into the socket's VNI hash.
  *
  * Returns 0 on success.  On failure everything acquired here (multicast
  * slot, multicast state, socket reference) is released and the error is
  * returned.
  */
 static int
 vxlan_setup_socket(struct vxlan_softc *sc)
 {
 	struct vxlan_socket *vso;
 	struct ifnet *ifp;
 	union vxlan_sockaddr *saddr, *daddr;
 	int multicast, error;
 
 	vso = NULL;
 	ifp = sc->vxl_ifp;
 	saddr = &sc->vxl_src_addr;
 	daddr = &sc->vxl_dst_addr;
 
 	multicast = vxlan_sockaddr_in_multicast(daddr);
 	MPASS(multicast != -1);
 	/* -1 marks "no multicast slot held"; the cleanup path checks it. */
 	sc->vxl_vso_mc_index = -1;
 
 	/*
 	 * Try to create the socket. If that fails, attempt to use an
 	 * existing socket.
 	 */
 	error = vxlan_socket_create(ifp, multicast, saddr, &vso);
 	if (error) {
 		if (multicast != 0)
 			vso = vxlan_socket_mc_lookup(saddr);
 		else
 			vso = vxlan_socket_lookup(saddr);
 
 		if (vso == NULL) {
 			if_printf(ifp, "cannot create socket (error: %d), "
 			    "and no existing socket found\n", error);
 			goto out;
 		}
 	}
 
 	if (multicast != 0) {
 		error = vxlan_setup_multicast(sc);
 		if (error)
 			goto out;
 
 		error = vxlan_socket_mc_add_group(vso, daddr, saddr,
 		    sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index);
 		if (error)
 			goto out;
 	}
 
 	sc->vxl_sock = vso;
 	error = vxlan_socket_insert_softc(vso, sc);
 	if (error) {
 		sc->vxl_sock = NULL;
 		if_printf(ifp, "network identifier %d already exists in "
 		    "this socket\n", sc->vxl_vni);
 		goto out;
 	}
 
 	return (0);
 
 out:
 	/* With vso == NULL nothing was acquired and `error' is returned. */
 	if (vso != NULL) {
 		if (sc->vxl_vso_mc_index != -1) {
 			vxlan_socket_mc_release_group_by_idx(vso,
 			    sc->vxl_vso_mc_index);
 			sc->vxl_vso_mc_index = -1;
 		}
 		if (multicast != 0)
 			vxlan_free_multicast(sc);
 		vxlan_socket_release(vso);
 	}
 
 	return (error);
 }
 
 #ifdef INET6
 /*
  * For an IPv6 source address, set V_zero_checksum_port (reported as
  * "rfc6935_port") to the interface's UDP port when it is still unset.
  * Nothing is changed, and a message is printed, when the source and
  * destination ports differ or when the variable already holds another
  * port.
  */
 static void
 vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
 {
 	int sport, dport;
 
 	if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
 		return;
 
 	MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
 	MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
 
 	/* Both ports in host byte order. */
 	sport = ntohs(sc->vxl_src_addr.in6.sin6_port);
 	dport = ntohs(sc->vxl_dst_addr.in6.sin6_port);
 
 	if (sport != dport) {
 		if_printf(sc->vxl_ifp, "port %d in src address does not match "
 		    "port %d in dst address, rfc6935_port (%d) not updated.\n",
 		    sport, dport, V_zero_checksum_port);
 		return;
 	}
 
 	if (V_zero_checksum_port == 0) {
 		V_zero_checksum_port = sport;
 		if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
 		    V_zero_checksum_port);
 		return;
 	}
 
 	/* Already set: only complain if it names a different port. */
 	if (V_zero_checksum_port != sport) {
 		if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
 		    "%d, cannot set it to %d.\n", V_zero_checksum_port,
 		    sport);
 	}
 }
 #endif
 
 /*
  * Compute the encapsulation header length (Ethernet + VXLAN/UDP + IPv4 or
  * IPv6 header, by destination family) and store it in the interface.
  * Unless VXLAN_FLAG_USER_MTU is set, the MTU is then derived from ETHERMTU
  * minus that length.  The write lock must be held (asserted).
  */
 static void
 vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 
 	VXLAN_LOCK_WASSERT(sc);
 
 	ifp = sc->vxl_ifp;
 	ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr);
 
 	if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr))
 		ifp->if_hdrlen += sizeof(struct ip);
 	else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr))
 		ifp->if_hdrlen += sizeof(struct ip6_hdr);
 
 	/* With VXLAN_FLAG_USER_MTU set the MTU is left as it is. */
 	if (sc->vxl_flags & VXLAN_FLAG_USER_MTU)
 		return;
 
 	ifp->if_mtu = ETHERMTU - ifp->if_hdrlen;
 }
 
 static int
 vxlan_valid_init_config(struct vxlan_softc *sc)
 {
 	const char *reason;
 
 	if (vxlan_check_vni(sc->vxl_vni) != 0) {
 		reason = "invalid virtual network identifier specified";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) {
 		reason = "source address type is not supported";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) {
 		reason = "destination address type is not supported";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) {
 		reason = "no valid destination address specified";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 &&
 	    sc->vxl_mc_ifname[0] != '\0') {
 		reason = "can only specify interface with a group address";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
 		if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^
 		    VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) {
 			reason = "source and destination address must both "
 			    "be either IPv4 or IPv6";
 			goto fail;
 		}
 	}
 
 	if (sc->vxl_src_addr.in4.sin_port == 0) {
 		reason = "local port not specified";
 		goto fail;
 	}
 
 	if (sc->vxl_dst_addr.in4.sin_port == 0) {
 		reason = "remote port not specified";
 		goto fail;
 	}
 
 	return (0);
 
 fail:
 	if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason);
 	return (EINVAL);
 }
 
 /*
  * Sleep until VXLAN_FLAG_INIT is no longer set in the softc flags.
  * The softc write lock must be held by the caller.
  */
 static void
 vxlan_init_wait(struct vxlan_softc *sc)
 {
 
 	VXLAN_LOCK_WASSERT(sc);
 	for (;;) {
 		if ((sc->vxl_flags & VXLAN_FLAG_INIT) == 0)
 			break;
 		/* Sleep with a timeout of hz ticks, then re-check the flag. */
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz);
 	}
 }
 
 /*
  * Mark initialization as finished: clear VXLAN_FLAG_INIT under the softc
  * write lock and wake any thread sleeping on the softc.
  */
 static void
 vxlan_init_complete(struct vxlan_softc *sc)
 {
 
 	VXLAN_WLOCK(sc);
 	sc->vxl_flags &= ~VXLAN_FLAG_INIT;
 	wakeup(sc);
 	VXLAN_WUNLOCK(sc);
 }
 
 /*
  * Bring the interface up.  xsc is the vxlan softc.
  *
  * Validates the configuration, sets up the socket, initializes the default
  * forwarding entry, marks the interface running, arms the forwarding table
  * callout and reports link up.  Returns early, doing nothing, when the
  * interface is already running.  If validation or socket setup fails the
  * interface is left not running; in every non-early-return case
  * VXLAN_FLAG_INIT is cleared again through vxlan_init_complete().
  *
  * The whole operation is serialized by the global vxlan_sx lock.
  */
 static void
 vxlan_init(void *xsc)
 {
 	/* All-zero MAC address used for the default forwarding entry. */
 	static const uint8_t empty_mac[ETHER_ADDR_LEN];
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 
 	sc = xsc;
 	ifp = sc->vxl_ifp;
 
 	sx_xlock(&vxlan_sx);
 	VXLAN_WLOCK(sc);
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		VXLAN_WUNLOCK(sc);
 		sx_xunlock(&vxlan_sx);
 		return;
 	}
 	/* Flag the init as in progress; cleared by vxlan_init_complete(). */
 	sc->vxl_flags |= VXLAN_FLAG_INIT;
 	VXLAN_WUNLOCK(sc);
 
 	if (vxlan_valid_init_config(sc) != 0)
 		goto out;
 
 	if (vxlan_setup_socket(sc) != 0)
 		goto out;
 
 #ifdef INET6
 	vxlan_setup_zero_checksum_port(sc);
 #endif
 
 	/* Initialize the default forwarding entry. */
 	vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
 	    &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
 
 	VXLAN_WLOCK(sc);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz,
 	    vxlan_timer, sc);
 	VXLAN_WUNLOCK(sc);
 
 	if_link_state_change(ifp, LINK_STATE_UP);
 
 	/* Notify vxlan_start listeners of the address family and local port. */
 	EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
 	    ntohs(sc->vxl_src_addr.in4.sin_port));
 out:
 	vxlan_init_complete(sc);
 	sx_xunlock(&vxlan_sx);
 }
 
 /*
  * Drop a reference on the softc via VXLAN_RELEASE() and, when that macro
  * returns nonzero, wake any thread sleeping on the softc (presumably this
  * signals that the last reference was dropped; confirm against the macro).
  */
 static void
 vxlan_release(struct vxlan_softc *sc)
 {
 
 	/*
 	 * The softc may be destroyed as soon as we release our reference,
 	 * so we cannot serialize the wakeup with the softc lock. We use a
 	 * timeout in our sleeps so a missed wakeup is unfortunate but not
 	 * fatal.
 	 */
 	if (VXLAN_RELEASE(sc) != 0)
 		wakeup(sc);
 }
 
 /*
  * Sleep until VXLAN_FLAG_TEARDOWN is no longer set in the softc flags.
  * The softc write lock must be held by the caller.
  */
 static void
 vxlan_teardown_wait(struct vxlan_softc *sc)
 {
 
 	VXLAN_LOCK_WASSERT(sc);
 	for (;;) {
 		if ((sc->vxl_flags & VXLAN_FLAG_TEARDOWN) == 0)
 			break;
 		/* Sleep with a timeout of hz ticks, then re-check the flag. */
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz);
 	}
 }
 
 /*
  * Mark teardown as finished: clear VXLAN_FLAG_TEARDOWN under the softc
  * write lock and wake any thread sleeping on the softc.
  */
 static void
 vxlan_teardown_complete(struct vxlan_softc *sc)
 {
 
 	VXLAN_WLOCK(sc);
 	sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN;
 	wakeup(sc);
 	VXLAN_WUNLOCK(sc);
 }
 
 /*
  * Tear the interface down.
  *
  * Entered with vxlan_sx held exclusively, the softc write lock held and
  * VXLAN_FLAG_TEARDOWN already set.  The softc lock is dropped inside this
  * function and is NOT held on return; vxlan_sx remains held.
  */
 static void
 vxlan_teardown_locked(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 	struct vxlan_socket *vso;
 
 	sx_assert(&vxlan_sx, SA_XLOCKED);
 	VXLAN_LOCK_WASSERT(sc);
 	MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);
 
 	ifp = sc->vxl_ifp;
 	ifp->if_flags &= ~IFF_UP;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	callout_stop(&sc->vxl_callout);
 	/* Detach the socket pointer from the softc while still locked. */
 	vso = sc->vxl_sock;
 	sc->vxl_sock = NULL;
 
 	VXLAN_WUNLOCK(sc);
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	/* Notify vxlan_stop listeners of the address family and local port. */
 	EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
 	    ntohs(sc->vxl_src_addr.in4.sin_port));
 
 	if (vso != NULL) {
 		vxlan_socket_remove_softc(vso, sc);
 
 		if (sc->vxl_vso_mc_index != -1) {
 			vxlan_socket_mc_release_group_by_idx(vso,
 			    sc->vxl_vso_mc_index);
 			sc->vxl_vso_mc_index = -1;
 		}
 	}
 
 	/* Wait for every outstanding softc reference to be released. */
 	VXLAN_WLOCK(sc);
 	while (sc->vxl_refcnt != 0)
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz);
 	VXLAN_WUNLOCK(sc);
 
 	callout_drain(&sc->vxl_callout);
 
 	vxlan_free_multicast(sc);
 	if (vso != NULL)
 		vxlan_socket_release(vso);
 
 	/* Clears VXLAN_FLAG_TEARDOWN and wakes waiters. */
 	vxlan_teardown_complete(sc);
 }
 
 /*
  * Tear the interface down, or wait for a teardown that is already in
  * progress to finish.  Serialized by the global vxlan_sx lock.
  */
 static void
 vxlan_teardown(struct vxlan_softc *sc)
 {
 
 	sx_xlock(&vxlan_sx);
 	VXLAN_WLOCK(sc);
 	if ((sc->vxl_flags & VXLAN_FLAG_TEARDOWN) == 0) {
 		sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
 		/* Returns with the softc lock released. */
 		vxlan_teardown_locked(sc);
 	} else {
 		/* Somebody else is tearing down; just wait for them. */
 		vxlan_teardown_wait(sc);
 		VXLAN_WUNLOCK(sc);
 	}
 	sx_xunlock(&vxlan_sx);
 }
 
 /*
  * If this softc uses the departing interface 'ifp' as its multicast
  * interface and is not already being torn down, flag it for teardown and
  * queue it on 'list' (linked through vxl_ifdetach_list).
  */
 static void
 vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp,
     struct vxlan_softc_head *list)
 {
 
 	VXLAN_WLOCK(sc);
 	if (sc->vxl_mc_ifp == ifp &&
 	    (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) == 0) {
 		sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
 		LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list);
 	}
 	VXLAN_WUNLOCK(sc);
 }
 
 /*
  * Callout handler for the softc's vxl_callout: runs vxlan_ftable_expire()
  * and reschedules itself vxlan_ftable_prune_period seconds later.
  * xsc is the vxlan softc; its write lock is asserted to be held.
  */
 static void
 vxlan_timer(void *xsc)
 {
 	struct vxlan_softc *sc = xsc;
 
 	VXLAN_LOCK_WASSERT(sc);
 
 	vxlan_ftable_expire(sc);
 	callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz);
 }
 
 /*
  * React to an interface flags change: initialize the interface when it is
  * marked up but not running, tear it down when it is marked down but still
  * running.  Always returns 0.
  */
 static int
 vxlan_ioctl_ifflags(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp = sc->vxl_ifp;
 	int up, running;
 
 	up = (ifp->if_flags & IFF_UP) != 0;
 	running = (ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 
 	if (up && !running)
 		vxlan_init(sc);
 	else if (!up && running)
 		vxlan_teardown(sc);
 
 	return (0);
 }
 
 /*
  * Control handler: fill the struct ifvxlancfg pointed to by 'arg' with a
  * snapshot of the softc configuration taken under the read lock.
  * Always returns 0.
  */
 static int
 vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg)
 {
 	struct rm_priotracker tracker;
 	struct ifvxlancfg *cfg = arg;
 
 	bzero(cfg, sizeof(*cfg));
 
 	VXLAN_RLOCK(sc, &tracker);
 	/* Addresses first, then the scalar settings. */
 	memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr,
 	    sizeof(union vxlan_sockaddr));
 	memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr,
 	    sizeof(union vxlan_sockaddr));
 	cfg->vxlc_vni = sc->vxl_vni;
 	cfg->vxlc_ttl = sc->vxl_ttl;
 	cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0;
 	cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex;
 	cfg->vxlc_port_min = sc->vxl_min_port;
 	cfg->vxlc_port_max = sc->vxl_max_port;
 	cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt;
 	cfg->vxlc_ftable_max = sc->vxl_ftable_max;
 	cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout;
 	VXLAN_RUNLOCK(sc, &tracker);
 
 #ifdef INET6
 	/* Post-process the copied IPv6 addresses outside the lock. */
 	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_local_sa))
 		sa6_recoverscope(&cfg->vxlc_local_sa.in6);
 	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_remote_sa))
 		sa6_recoverscope(&cfg->vxlc_remote_sa.in6);
 #endif
 
 	return (0);
 }
 
 static int
 vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (vxlan_check_vni(cmd->vxlcmd_vni) != 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_vni = cmd->vxlcmd_vni;
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	union vxlan_sockaddr *vxlsa;
 	int error;
 
 	cmd = arg;
 	vxlsa = &cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
 		return (EINVAL);
 	if (vxlan_sockaddr_in_multicast(vxlsa) != 0)
 		return (EINVAL);
 	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
 		vxlan_set_hwcaps(sc);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	union vxlan_sockaddr *vxlsa;
 	int error;
 
 	cmd = arg;
 	vxlsa = &cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
 		return (EINVAL);
 	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
 		vxlan_setup_interface_hdrlen(sc);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (cmd->vxlcmd_port == 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (cmd->vxlcmd_port == 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	uint16_t min, max;
 	int error;
 
 	cmd = arg;
 	min = cmd->vxlcmd_port_min;
 	max = cmd->vxlcmd_port_max;
 
 	if (max < min)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_min_port = min;
 		sc->vxl_max_port = max;
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) {
 		sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) {
 		sc->vxl_ftable_max = cmd->vxlcmd_ftable_max;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
 		vxlan_set_hwcaps(sc);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) {
 		sc->vxl_ttl = cmd->vxlcmd_ttl;
 		if (sc->vxl_im4o != NULL)
 			sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
 		if (sc->vxl_im6o != NULL)
 			sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 /*
  * Control handler: set or clear VXLAN_FLAG_LEARN according to the
  * VXLAN_CMD_FLAG_LEARN bit of the struct ifvxlancmd in 'arg'.
  * Always returns 0.
  */
 static int
 vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if ((cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN) == 0)
 		sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
 	else
 		sc->vxl_flags |= VXLAN_FLAG_LEARN;
 	VXLAN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg)
 {
 	union vxlan_sockaddr vxlsa;
 	struct ifvxlancmd *cmd;
 	struct vxlan_ftable_entry *fe;
 	int error;
 
 	cmd = arg;
 	vxlsa = cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa))
 		return (EINVAL);
 	if (vxlan_sockaddr_in_any(&vxlsa) != 0)
 		return (EINVAL);
 	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
 		return (EINVAL);
 	/* BMV: We could support both IPv4 and IPv6 later. */
 	if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family)
 		return (EAFNOSUPPORT);
 
 	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
 		if (error)
 			return (error);
 	}
 
 	fe = vxlan_ftable_entry_alloc();
 	if (fe == NULL)
 		return (ENOMEM);
 
 	if (vxlsa.in4.sin_port == 0)
 		vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
 
 	vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa,
 	    VXLAN_FE_FLAG_STATIC);
 
 	VXLAN_WLOCK(sc);
 	error = vxlan_ftable_entry_insert(sc, fe);
 	VXLAN_WUNLOCK(sc);
 
 	if (error)
 		vxlan_ftable_entry_free(fe);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	struct vxlan_ftable_entry *fe;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac);
 	if (fe != NULL) {
 		vxlan_ftable_entry_destroy(sc, fe);
 		error = 0;
 	} else
 		error = ENOENT;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 /*
  * Control handler: flush the forwarding table under the write lock.  The
  * VXLAN_CMD_FLAG_FLUSH_ALL bit of the command is passed through to
  * vxlan_ftable_flush() as its 'all' argument.  Always returns 0.
  */
 static int
 vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd = arg;
 	int flush_all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL;
 
 	VXLAN_WLOCK(sc);
 	vxlan_ftable_flush(sc, flush_all);
 	VXLAN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get)
 {
 	const struct vxlan_control *vc;
 	union {
 		struct ifvxlancfg	cfg;
 		struct ifvxlancmd	cmd;
 	} args;
 	int out, error;
 
 	if (ifd->ifd_cmd >= vxlan_control_table_size)
 		return (EINVAL);
 
 	bzero(&args, sizeof(args));
 	vc = &vxlan_control_table[ifd->ifd_cmd];
 	out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0;
 
 	if ((get != 0 && out == 0) || (get == 0 && out != 0))
 		return (EINVAL);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) {
 		error = priv_check(curthread, PRIV_NET_VXLAN);
 		if (error)
 			return (error);
 	}
 
 	if (ifd->ifd_len != vc->vxlc_argsize ||
 	    ifd->ifd_len > sizeof(args))
 		return (EINVAL);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) {
 		error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 		if (error)
 			return (error);
 	}
 
 	error = vc->vxlc_func(sc, &args);
 	if (error)
 		return (error);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) {
 		error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Interface ioctl handler.  Handles the driver-specific, flags, media,
  * MTU, capability and tunnel FIB requests itself and passes everything
  * else to ether_ioctl().  Returns 0 or an errno value.
  */
 static int
 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct rm_priotracker tracker;
 	struct vxlan_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *) data;
 	struct ifdrv *ifd = (struct ifdrv *) data;
 	int error = 0;
 
 	switch (cmd) {
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted without any action. */
 		break;
 
 	case SIOCGDRVSPEC:
 	case SIOCSDRVSPEC:
 		error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC);
 		break;
 
 	case SIOCSIFFLAGS:
 		error = vxlan_ioctl_ifflags(sc);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->vxl_media, cmd);
 		break;
 
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu >= ETHERMIN &&
 		    ifr->ifr_mtu <= VXLAN_MAX_MTU) {
 			VXLAN_WLOCK(sc);
 			ifp->if_mtu = ifr->ifr_mtu;
 			/* Remember that the MTU was chosen explicitly. */
 			sc->vxl_flags |= VXLAN_FLAG_USER_MTU;
 			VXLAN_WUNLOCK(sc);
 		} else
 			error = EINVAL;
 		break;
 
 	case SIOCSIFCAP:
 		VXLAN_WLOCK(sc);
 		error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
 		if (error == 0)
 			vxlan_set_hwcaps(sc);
 		VXLAN_WUNLOCK(sc);
 		break;
 
 	case SIOCGTUNFIB:
 		VXLAN_RLOCK(sc, &tracker);
 		ifr->ifr_fib = sc->vxl_fibnum;
 		VXLAN_RUNLOCK(sc, &tracker);
 		break;
 
 	case SIOCSTUNFIB:
 		error = priv_check(curthread, PRIV_NET_VXLAN);
 		if (error != 0)
 			break;
 
 		if (ifr->ifr_fib < rt_numfibs) {
 			VXLAN_WLOCK(sc);
 			sc->vxl_fibnum = ifr->ifr_fib;
 			VXLAN_WUNLOCK(sc);
 		} else
 			error = EINVAL;
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Pick the outer UDP source port for a frame from the configured
  * [vxl_min_port, vxl_max_port] range.  The choice is driven by the mbuf's
  * flow id when it carries a hash, otherwise by a Jenkins hash over the
  * first ETHER_HDR_LEN bytes of the frame keyed with vxl_port_hash_key.
  *
  * The port range is kept in host byte order, while the callers store the
  * result straight into the UDP header.  The value is therefore returned in
  * network byte order so the port that goes on the wire really lies inside
  * the configured range.
  */
 static uint16_t
 vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m)
 {
 	int range;
 	uint32_t hash;
 
 	range = sc->vxl_max_port - sc->vxl_min_port + 1;
 
 	if (M_HASHTYPE_ISHASH(m))
 		hash = m->m_pkthdr.flowid;
 	else
 		hash = jenkins_hash(m->m_data, ETHER_HDR_LEN,
 		    sc->vxl_port_hash_key);
 
 	return (htons(sc->vxl_min_port + (hash % range)));
 }
 
 /*
  * Fill in the UDP and VXLAN headers located 'ipoff' bytes into the mbuf
  * (i.e. directly after the outer IP header).
  *
  * srcport and dstport are stored into the UDP header exactly as passed,
  * without any byte swapping.  The UDP length covers everything from the
  * UDP header to the end of the packet and the UDP checksum is zeroed.
  */
 static void
 vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
     uint16_t srcport, uint16_t dstport)
 {
 	struct vxlanudphdr *hdr;
 	struct udphdr *udph;
 	struct vxlan_header *vxh;
 	int len;
 
 	/* Bytes following the outer IP header. */
 	len = m->m_pkthdr.len - ipoff;
 	MPASS(len >= sizeof(struct vxlanudphdr));
 	hdr = mtodo(m, ipoff);
 
 	udph = &hdr->vxlh_udp;
 	udph->uh_sport = srcport;
 	udph->uh_dport = dstport;
 	udph->uh_ulen = htons(len);
 	udph->uh_sum = 0;
 
 	/* The VNI is shifted by VXLAN_HDR_VNI_SHIFT before conversion. */
 	vxh = &hdr->vxlh_hdr;
 	vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI);
 	vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT);
 }
 #endif
 
 #if defined(INET6) || defined(INET)
 /*
  * Translate CSUM_* offload request bits into their CSUM_INNER_*
  * counterparts.  The result starts out as 'encap' and has the matching
  * inner bits OR'ed in.
  */
 static uint32_t
 csum_flags_to_inner_flags(uint32_t csum_flags_in, const uint32_t encap)
 {
 	const uint32_t v4_bits = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
 	uint32_t inner = encap;
 
 	/*
 	 * csum_flags can request either v4 or v6 offload but not both.
 	 * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
 	 * so those bits are no good to detect the IP version.  Other bits are
 	 * always set with CSUM_TSO and we use those to figure out the IP
 	 * version.
 	 */
 	if ((csum_flags_in & v4_bits) == 0) {
 #ifdef INVARIANTS
 		const uint32_t v6_bits = CSUM_IP6_UDP | CSUM_IP6_TCP;
 
 		MPASS((csum_flags_in & v6_bits) != 0);
 #endif
 		/* IPv6 request. */
 		if ((csum_flags_in & CSUM_IP6_UDP) != 0)
 			inner |= CSUM_INNER_IP6_UDP;
 		if ((csum_flags_in & CSUM_IP6_TCP) != 0)
 			inner |= CSUM_INNER_IP6_TCP;
 		if ((csum_flags_in & CSUM_IP6_TSO) != 0)
 			inner |= CSUM_INNER_IP6_TSO;
 
 		return (inner);
 	}
 
 	/* IPv4 request. */
 	if ((csum_flags_in & CSUM_IP) != 0)
 		inner |= CSUM_INNER_IP;
 	if ((csum_flags_in & CSUM_IP_UDP) != 0)
 		inner |= CSUM_INNER_IP_UDP;
 	if ((csum_flags_in & CSUM_IP_TCP) != 0)
 		inner |= CSUM_INNER_IP_TCP;
 	if ((csum_flags_in & CSUM_IP_TSO) != 0)
 		inner |= CSUM_INNER_IP_TSO;
 
 	return (inner);
 }
 #endif
 
 /*
  * Encapsulate the frame in 'm' in IPv4/UDP/VXLAN headers addressed to
  * 'fvxlsa' and hand it to ip_output().
  *
  * The mbuf is always consumed.  Returns 0 on success or an errno value:
  * ENOBUFS when the headers cannot be prepended, EHOSTUNREACH when a route
  * lookup (done only if checksum/TSO offload was requested) fails, ENXIO
  * when the outbound interface lacks the requested offload capabilities,
  * or the error from ip_output().  Without INET the frame is dropped and
  * ENOTSUP is returned.  Interface counters are updated accordingly.
  */
 static int
 vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
     struct mbuf *m)
 {
 #ifdef INET
 	struct ifnet *ifp;
 	struct ip *ip;
 	struct in_addr srcaddr, dstaddr;
 	uint16_t srcport, dstport;
 	int plen, mcast, error;
 	struct route route, *ro;
 	struct sockaddr_in *sin;
 	uint32_t csum_flags;
 
 	NET_EPOCH_ASSERT();
 
 	ifp = sc->vxl_ifp;
 	srcaddr = sc->vxl_src_addr.in4.sin_addr;
 	srcport = vxlan_pick_source_port(sc, m);
 	dstaddr = fvxlsa->in4.sin_addr;
 	dstport = fvxlsa->in4.sin_port;
 
 	/* Length of the inner frame, before any header is prepended. */
 	plen = m->m_pkthdr.len;
 	M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr),
 	    M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_off = 0;
 	ip->ip_ttl = sc->vxl_ttl;
 	ip->ip_p = IPPROTO_UDP;
 	ip->ip_sum = 0;
 	ip->ip_src = srcaddr;
 	ip->ip_dst = dstaddr;
 
 	vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport);
 
 	/* Remember the inner frame's mcast/bcast state, then clear it. */
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 
 	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
 	if (m->m_pkthdr.csum_flags != 0) {
 		/*
 		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
 		 * up the ifnet for the outbound route and verify that the
 		 * outbound ifnet can perform the requested operation on the
 		 * inner frame.
 		 */
 		bzero(&route, sizeof(route));
 		ro = &route;
 		sin = (struct sockaddr_in *)&ro->ro_dst;
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
 		sin->sin_addr = ip->ip_dst;
 		ro->ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
 		    0);
 		if (ro->ro_nh == NULL) {
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (EHOSTUNREACH);
 		}
 
 		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
 		    CSUM_ENCAP_VXLAN);
 		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
 		    csum_flags) {
 			/* Rate-limit the diagnostic via ppsratecheck(). */
 			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
 				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
 
 				if_printf(ifp, "interface %s is missing hwcaps "
 				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
 				    "hwassist 0x%08x\n", nh_ifp->if_xname,
 				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
 				    m->m_pkthdr.csum_flags, csum_flags,
 				    (uint32_t)nh_ifp->if_hwassist);
 			}
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENXIO);
 		}
 		m->m_pkthdr.csum_flags = csum_flags;
 		if (csum_flags &
 		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
 		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
 			counter_u64_add(sc->vxl_stats.txcsum, 1);
 			if (csum_flags & CSUM_INNER_TSO)
 				counter_u64_add(sc->vxl_stats.tso, 1);
 		}
 	} else
 		ro = NULL;
 	error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		/* Byte count reflects the inner frame only. */
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, plen);
 		if (mcast != 0)
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 #else
 	m_freem(m);
 	return (ENOTSUP);
 #endif
 }
 
 /*
  * Encapsulate the frame in 'm' in IPv6/UDP/VXLAN headers addressed to
  * 'fvxlsa' and hand it to ip6_output().
  *
  * The mbuf is always consumed.  Returns 0 on success or an errno value:
  * ENOBUFS when the headers cannot be prepended, EHOSTUNREACH when a route
  * lookup (done only if checksum/TSO offload was requested) fails, ENXIO
  * when the outbound interface lacks the requested offload capabilities,
  * or the error from ip6_output().  Without INET6 the frame is dropped and
  * ENOTSUP is returned.  Interface counters are updated accordingly.
  */
 static int
 vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
     struct mbuf *m)
 {
 #ifdef INET6
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	const struct in6_addr *srcaddr, *dstaddr;
 	uint16_t srcport, dstport;
 	int plen, mcast, error;
 	struct route_in6 route, *ro;
 	struct sockaddr_in6 *sin6;
 	uint32_t csum_flags;
 
 	NET_EPOCH_ASSERT();
 
 	ifp = sc->vxl_ifp;
 	srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
 	srcport = vxlan_pick_source_port(sc, m);
 	dstaddr = &fvxlsa->in6.sin6_addr;
 	dstport = fvxlsa->in6.sin6_port;
 
 	/* Length of the inner frame, before any header is prepended. */
 	plen = m->m_pkthdr.len;
 	M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr),
 	    M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;		/* BMV: Keep in forwarding entry? */
 	ip6->ip6_vfc = IPV6_VERSION;
 	ip6->ip6_plen = 0;
 	ip6->ip6_nxt = IPPROTO_UDP;
 	ip6->ip6_hlim = sc->vxl_ttl;
 	ip6->ip6_src = *srcaddr;
 	ip6->ip6_dst = *dstaddr;
 
 	vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
 
 	/* Remember the inner frame's mcast/bcast state, then clear it. */
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 
 	ro = NULL;
 	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
 	if (m->m_pkthdr.csum_flags != 0) {
 		/*
 		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
 		 * up the ifnet for the outbound route and verify that the
 		 * outbound ifnet can perform the requested operation on the
 		 * inner frame.
 		 */
 		bzero(&route, sizeof(route));
 		ro = &route;
 		sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
 		sin6->sin6_family = AF_INET6;
 		sin6->sin6_len = sizeof(*sin6);
 		sin6->sin6_addr = ip6->ip6_dst;
 		ro->ro_nh = fib6_lookup(M_GETFIB(m), &ip6->ip6_dst, 0,
 		    NHR_NONE, 0);
 		if (ro->ro_nh == NULL) {
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (EHOSTUNREACH);
 		}
 
 		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
 		    CSUM_ENCAP_VXLAN);
 		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
 		    csum_flags) {
 			/* Rate-limit the diagnostic via ppsratecheck(). */
 			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
 				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
 
 				if_printf(ifp, "interface %s is missing hwcaps "
 				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
 				    "hwassist 0x%08x\n", nh_ifp->if_xname,
 				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
 				    m->m_pkthdr.csum_flags, csum_flags,
 				    (uint32_t)nh_ifp->if_hwassist);
 			}
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENXIO);
 		}
 		m->m_pkthdr.csum_flags = csum_flags;
 		if (csum_flags &
 		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
 		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
 			counter_u64_add(sc->vxl_stats.txcsum, 1);
 			if (csum_flags & CSUM_INNER_TSO)
 				counter_u64_add(sc->vxl_stats.tso, 1);
 		}
 	} else if (ntohs(dstport) != V_zero_checksum_port) {
 		/*
 		 * No offload requested and the destination is not the
 		 * zero-checksum port: seed the outer UDP checksum with the
 		 * pseudo-header sum and request a UDP/IPv6 checksum for it.
 		 */
 		struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
 
 		hdr->uh_sum = in6_cksum_pseudo(ip6,
 		    m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
 		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 	error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		/* Byte count reflects the inner frame only. */
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, plen);
 		if (mcast != 0)
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 #else
 	m_freem(m);
 	return (ENOTSUP);
 #endif
 }
 
 /*
  * Transmit routine of the interface: choose the remote endpoint for the
  * frame and encapsulate it over IPv4 or IPv6.
  *
  * Unicast frames are looked up in the forwarding table by destination MAC;
  * broadcast/multicast frames and lookup misses use the default forwarding
  * entry.  Returns ENETDOWN (after freeing the mbuf) when the interface is
  * not running, otherwise the result of the encapsulation routine.
  */
 static int
 vxlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct rm_priotracker tracker;
 	union vxlan_sockaddr vxlsa;
 	struct vxlan_softc *sc;
 	struct vxlan_ftable_entry *fe;
 	struct ifnet *mcifp;
 	struct ether_header *eh;
 	int ipv4, error;
 
 	sc = ifp->if_softc;
 	eh = mtod(m, struct ether_header *);
 	fe = NULL;
 	mcifp = NULL;
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	VXLAN_RLOCK(sc, &tracker);
 	M_SETFIB(m, sc->vxl_fibnum);
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		VXLAN_RUNLOCK(sc, &tracker);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 		fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost);
 	if (fe == NULL)
 		fe = &sc->vxl_default_fe;
 	/* Copy the remote address so it can be used after unlocking. */
 	vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa);
 
 	ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0;
 	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
 		mcifp = vxlan_multicast_if_ref(sc, ipv4);
 
 	/* Take a softc reference before the read lock is dropped. */
 	VXLAN_ACQUIRE(sc);
 	VXLAN_RUNLOCK(sc, &tracker);
 
 	if (ipv4 != 0)
 		error = vxlan_encap4(sc, &vxlsa, m);
 	else
 		error = vxlan_encap6(sc, &vxlsa, m);
 
 	vxlan_release(sc);
 	if (mcifp != NULL)
 		if_rele(mcifp);
 
 	return (error);
 }
 
 /*
  * Queue flush routine; intentionally empty, the argument is unused.
  */
 static void
 vxlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Receive handler for UDP packets arriving on a vxlan socket ('xvso').
  *
  * 'offset' is the offset of the UDP header within the mbuf.  The VXLAN
  * header that follows it is validated, the outer headers are stripped and
  * the inner frame is passed to vxlan_input().  Any mbuf not consumed by
  * vxlan_input() is freed here.  Always returns true.
  */
 static bool
 vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb,
     const struct sockaddr *srcsa, void *xvso)
 {
 	struct vxlan_socket *vso;
 	struct vxlan_header *vxh, vxlanhdr;
 	uint32_t vni;
 	int error __unused;
 
 	M_ASSERTPKTHDR(m);
 	vso = xvso;
 	/* Skip past the UDP header to the VXLAN header. */
 	offset += sizeof(struct udphdr);
 
 	if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header))
 		goto out;
 
 	/*
 	 * Use the header in place when the first mbuf holds all of it,
 	 * otherwise copy it out into a local buffer.
 	 */
 	if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) {
 		m_copydata(m, offset, sizeof(struct vxlan_header),
 		    (caddr_t) &vxlanhdr);
 		vxh = &vxlanhdr;
 	} else
 		vxh = mtodo(m, offset);
 
 	/*
 	 * Drop if there is a reserved bit set in either the flags or VNI
 	 * fields of the header. This goes against the specification, but
 	 * a bit set may indicate an unsupported new feature. This matches
 	 * the behavior of the Linux implementation.
 	 */
 	/*
 	 * NOTE(review): vxlh_vni is tested against VXLAN_VNI_MASK before
 	 * ntohl() is applied; confirm the mask is meant for the network
 	 * byte order value on both little- and big-endian hosts.
 	 */
 	if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) ||
 	    vxh->vxlh_vni & ~VXLAN_VNI_MASK)
 		goto out;
 
 	vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT;
 
 	/* Adjust to the start of the inner Ethernet frame. */
 	m_adj_decap(m, offset + sizeof(struct vxlan_header));
 
 	error = vxlan_input(vso, vni, &m, srcsa);
 	/* On success vxlan_input() must have taken the mbuf. */
 	MPASS(error != 0 || m == NULL);
 
 out:
 	if (m != NULL)
 		m_freem(m);
 
 	return (true);
 }
 
 /*
  * Deliver a decapsulated inner Ethernet frame to the vxlan interface
  * registered for 'vni' on socket 'vso'.  'sa' is the sender's address and
  * is used for forwarding table learning when VXLAN_FLAG_LEARN is set.
  *
  * On success (return 0) the mbuf has been passed to the interface's
  * if_input routine and *m0 is set to NULL.  On failure an errno value is
  * returned and *m0 holds the chain the caller still has to free, or NULL
  * when m_pullup() already freed it.
  *
  * Returns EINVAL for a frame shorter than an Ethernet header, ENOENT when
  * no softc matches, ENOBUFS when m_pullup() fails, ENETDOWN when the
  * interface is not running and EDEADLK when the frame was received on the
  * vxlan interface itself.
  */
 static int
 vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
     const struct sockaddr *sa)
 {
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 	struct mbuf *m;
 	struct ether_header *eh;
 	int error;
 
 	m = *m0;
 
 	if (m->m_pkthdr.len < ETHER_HDR_LEN)
 		return (EINVAL);
 
 	/* The softc found here is released via vxlan_release() at 'out'. */
 	sc = vxlan_socket_lookup_softc(vso, vni);
 	if (sc == NULL)
 		return (ENOENT);
 
 	ifp = sc->vxl_ifp;
 	if (m->m_len < ETHER_HDR_LEN) {
 		/*
 		 * m_pullup() either frees the chain and returns NULL or
 		 * returns a chain whose head may differ from the one passed
 		 * in.  Either way the caller's pointer must be refreshed so
 		 * that the error paths below do not leave it holding a stale
 		 * mbuf to free.
 		 */
 		m = m_pullup(m, ETHER_HDR_LEN);
 		*m0 = m;
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto out;
 		}
 	}
 	eh = mtod(m, struct ether_header *);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		error = ENETDOWN;
 		goto out;
 	} else if (ifp == m->m_pkthdr.rcvif) {
 		/* XXX Does not catch more complex loops. */
 		error = EDEADLK;
 		goto out;
 	}
 
 	if (sc->vxl_flags & VXLAN_FLAG_LEARN)
 		vxlan_ftable_learn(sc, sa, eh->ether_shost);
 
 	m_clrprotoflags(m);
 	m->m_pkthdr.rcvif = ifp;
 	M_SETFIB(m, ifp->if_fib);
 	if (((ifp->if_capenable & IFCAP_RXCSUM &&
 	    m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
 	    (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 	    !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
 		uint32_t csum_flags = 0;
 
 		/* Promote the inner checksum results to the outer bits. */
 		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
 			csum_flags |= CSUM_L3_CALC;
 		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
 			csum_flags |= CSUM_L3_VALID;
 		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
 			csum_flags |= CSUM_L4_CALC;
 		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
 			csum_flags |= CSUM_L4_VALID;
 		m->m_pkthdr.csum_flags = csum_flags;
 		counter_u64_add(sc->vxl_stats.rxcsum, 1);
 	} else {
 		/* clear everything */
 		m->m_pkthdr.csum_flags = 0;
 		m->m_pkthdr.csum_data = 0;
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	(*ifp->if_input)(ifp, m);
 	/* The mbuf now belongs to the input path. */
 	*m0 = NULL;
 	error = 0;
 
 out:
 	vxlan_release(sc);
 	return (error);
 }
 
 /*
  * Allocate the txcsum, tso and rxcsum counters of the softc, in that
  * order.  If any allocation yields NULL everything allocated so far is
  * released through vxlan_stats_free() and ENOMEM is returned; otherwise 0.
  */
 static int
 vxlan_stats_alloc(struct vxlan_softc *sc)
 {
 	struct vxlan_statistics *st = &sc->vxl_stats;
 
 	if ((st->txcsum = counter_u64_alloc(M_WAITOK)) == NULL ||
 	    (st->tso = counter_u64_alloc(M_WAITOK)) == NULL ||
 	    (st->rxcsum = counter_u64_alloc(M_WAITOK)) == NULL) {
 		vxlan_stats_free(sc);
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /*
  * Free whichever of the softc's txcsum, tso and rxcsum counters are
  * allocated and leave all three pointers NULL, so the function is safe to
  * call on a partially allocated set and more than once.
  */
 static void
 vxlan_stats_free(struct vxlan_softc *sc)
 {
 	struct vxlan_statistics *st = &sc->vxl_stats;
 
 	if (st->txcsum != NULL)
 		counter_u64_free(st->txcsum);
 	st->txcsum = NULL;
 
 	if (st->tso != NULL)
 		counter_u64_free(st->tso);
 	st->tso = NULL;
 
 	if (st->rxcsum != NULL)
 		counter_u64_free(st->rxcsum);
 	st->rxcsum = NULL;
 }
 
 static void
 vxlan_set_default_config(struct vxlan_softc *sc)
 {
 
 	sc->vxl_flags |= VXLAN_FLAG_LEARN;
 
 	sc->vxl_vni = VXLAN_VNI_MAX;
 	sc->vxl_ttl = IPDEFTTL;
 
 	if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) {
 		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_PORT);
 		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT);
 	} else {
 		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
 		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
 	}
 
 	sc->vxl_min_port = V_ipport_firstauto;
 	sc->vxl_max_port = V_ipport_lastauto;
 
 	sc->vxl_ftable_max = VXLAN_FTABLE_MAX;
 	sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT;
 }
 
 /*
  * Apply the user-supplied parameters in 'vxlp' on top of the softc's
  * current configuration.  Only the fields whose VXLAN_PARAM_WITH_* bit is
  * set in vxlp_with are considered.
  *
  * Returns EAFNOSUPPORT when an address of a family not compiled into the
  * kernel is supplied, the error from vxlan_sockaddr_in6_embedscope() for
  * an IPv6 address it rejects, and 0 otherwise.  Values that fail their
  * vxlan_check_*() validation (VNI, forwarding table timeout/max, TTL) or
  * an inverted port range are skipped without reporting an error.
  */
 static int
 vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
 {
 
 #ifndef INET
 	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 |
 	    VXLAN_PARAM_WITH_REMOTE_ADDR4))
 		return (EAFNOSUPPORT);
 #endif
 
 #ifndef INET6
 	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 |
 	    VXLAN_PARAM_WITH_REMOTE_ADDR6))
 		return (EAFNOSUPPORT);
 #else
 	/* Run the supplied IPv6 addresses through embedscope in place. */
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
 		int error = vxlan_sockaddr_in6_embedscope(&vxlp->vxlp_local_sa);
 		if (error)
 			return (error);
 	}
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
 		int error = vxlan_sockaddr_in6_embedscope(
 		   &vxlp->vxlp_remote_sa);
 		if (error)
 			return (error);
 	}
 #endif
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) {
 		if (vxlan_check_vni(vxlp->vxlp_vni) == 0)
 			sc->vxl_vni = vxlp->vxlp_vni;
 	}
 
 	/* Local address: IPv4 takes precedence if both bits are set. */
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) {
 		sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in);
 		sc->vxl_src_addr.in4.sin_family = AF_INET;
 		sc->vxl_src_addr.in4.sin_addr =
 		    vxlp->vxlp_local_sa.in4.sin_addr;
 	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
 		sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
 		sc->vxl_src_addr.in6.sin6_family = AF_INET6;
 		sc->vxl_src_addr.in6.sin6_addr =
 		    vxlp->vxlp_local_sa.in6.sin6_addr;
 	}
 
 	/* Remote address: IPv4 takes precedence if both bits are set. */
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) {
 		sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in);
 		sc->vxl_dst_addr.in4.sin_family = AF_INET;
 		sc->vxl_dst_addr.in4.sin_addr =
 		    vxlp->vxlp_remote_sa.in4.sin_addr;
 	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
 		sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
 		sc->vxl_dst_addr.in6.sin6_family = AF_INET6;
 		sc->vxl_dst_addr.in6.sin6_addr =
 		    vxlp->vxlp_remote_sa.in6.sin6_addr;
 	}
 
 	/* Ports are supplied in host order and stored in network order. */
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT)
 		sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port);
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT)
 		sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port);
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) {
 		if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) {
 			sc->vxl_min_port = vxlp->vxlp_min_port;
 			sc->vxl_max_port = vxlp->vxlp_max_port;
 		}
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF)
 		strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ);
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) {
 		if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0)
 			sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) {
 		if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0)
 			sc->vxl_ftable_max = vxlp->vxlp_ftable_max;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) {
 		if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0)
 			sc->vxl_ttl = vxlp->vxlp_ttl;
 	}
 
 	/* This function only ever clears the learn flag, never sets it. */
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) {
 		if (vxlp->vxlp_learn == 0)
 			sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
 	}
 
 	return (0);
 }
 
 /*
  * Validate a requested capability set against the interface's currently
  * enabled capabilities and record it in sc->vxl_reqcap.
  *
  * Disabling tx checksumming also clears the matching TSO bit (with a console
  * message).  Newly enabling TSO while the matching tx checksum bit is off is
  * refused with EAGAIN and nothing is recorded.  Returns 0 otherwise.
  */
 static int
 vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
 {
 	/* Bits that differ between the request and what is enabled now. */
 	const int changed = reqcap ^ ifp->if_capenable;
 
 	/* Disable TSO if tx checksums are being disabled. */
 	if ((changed & IFCAP_TXCSUM) != 0 &&
 	    (reqcap & IFCAP_TXCSUM) == 0 &&
 	    (reqcap & IFCAP_TSO4) != 0) {
 		reqcap &= ~IFCAP_TSO4;
 		if_printf(ifp, "tso4 disabled due to -txcsum.\n");
 	}
 	if ((changed & IFCAP_TXCSUM_IPV6) != 0 &&
 	    (reqcap & IFCAP_TXCSUM_IPV6) == 0 &&
 	    (reqcap & IFCAP_TSO6) != 0) {
 		reqcap &= ~IFCAP_TSO6;
 		if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
 	}
 
 	/* Refuse to enable TSO while tx checksums are disabled. */
 	if ((changed & IFCAP_TSO4) != 0 &&
 	    (reqcap & IFCAP_TSO4) != 0 &&
 	    (reqcap & IFCAP_TXCSUM) == 0) {
 		if_printf(ifp, "enable txcsum first.\n");
 		return (EAGAIN);
 	}
 	if ((changed & IFCAP_TSO6) != 0 &&
 	    (reqcap & IFCAP_TSO6) != 0 &&
 	    (reqcap & IFCAP_TXCSUM_IPV6) == 0) {
 		if_printf(ifp, "enable txcsum6 first.\n");
 		return (EAGAIN);
 	}
 
 	sc->vxl_reqcap = reqcap;
 	return (0);
 }
 
 /*
  * A VXLAN interface inherits the capabilities of the vxlandev or the interface
  * hosting the vxlanlocal address.
  *
  * The vxlan ifnet's capabilities, enabled capabilities and hwassist bits are
  * first reset to the VXLAN_BASIC_IFCAPS baseline.  If an underlying interface
  * can be found, its checksum/TSO offload bits are then merged in, limited by
  * sc->vxl_reqcap.  When no underlying interface is found the baseline is
  * left as is.
  */
 static void
 vxlan_set_hwcaps(struct vxlan_softc *sc)
 {
 	struct epoch_tracker et;
 	struct ifnet *p;
 	struct ifaddr *ifa;
 	u_long hwa;
 	int cap, ena;
 	bool rel;
 	struct ifnet *ifp = sc->vxl_ifp;
 
 	/* reset caps */
 	ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
 	ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
 	ifp->if_hwassist = 0;
 
 	NET_EPOCH_ENTER(et);
 	CURVNET_SET(ifp->if_vnet);
 
 	/*
 	 * Locate the underlying interface p: by name when a multicast
 	 * interface is configured (rel records that it came from
 	 * ifunit_ref() and must be handed to if_rele()), otherwise via the
 	 * configured non-wildcard source address.
 	 */
 	rel = false;
 	p = NULL;
 	if (sc->vxl_mc_ifname[0] != '\0') {
 		rel = true;
 		p = ifunit_ref(sc->vxl_mc_ifname);
 	} else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
 		if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
 			struct sockaddr_in in4 = sc->vxl_src_addr.in4;
 
 			/* Look up by address only; clear the port in the copy. */
 			in4.sin_port = 0;
 			ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
 			if (ifa != NULL)
 				p = ifa->ifa_ifp;
 		} else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
 			struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
 
 			/* Look up by address only; clear the port in the copy. */
 			in6.sin6_port = 0;
 			ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
 			if (ifa != NULL)
 				p = ifa->ifa_ifp;
 		}
 	}
 	if (p == NULL)
 		goto done;
 
 	cap = ena = hwa = 0;
 
 	/* checksum offload */
 	if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
 		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 	if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
 		ena |= sc->vxl_reqcap & p->if_capenable &
 		    (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 		/* Map the parent's CSUM_INNER_* bits to our outer CSUM_* bits. */
 		if (ena & IFCAP_TXCSUM) {
 			if (p->if_hwassist & CSUM_INNER_IP)
 				hwa |= CSUM_IP;
 			if (p->if_hwassist & CSUM_INNER_IP_UDP)
 				hwa |= CSUM_IP_UDP;
 			if (p->if_hwassist & CSUM_INNER_IP_TCP)
 				hwa |= CSUM_IP_TCP;
 		}
 		if (ena & IFCAP_TXCSUM_IPV6) {
 			if (p->if_hwassist & CSUM_INNER_IP6_UDP)
 				hwa |= CSUM_IP6_UDP;
 			if (p->if_hwassist & CSUM_INNER_IP6_TCP)
 				hwa |= CSUM_IP6_TCP;
 		}
 	}
 
 	/* hardware TSO */
 	if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
 		cap |= p->if_capabilities & IFCAP_TSO;
 		/* Cap the TSO size at IP_MAXPACKET less our header length. */
 		if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
 			ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
 		else
 			ifp->if_hw_tsomax = p->if_hw_tsomax;
 		/* XXX: tsomaxsegcount decrement is cxgbe specific  */
 		ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
 		ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
 	}
 	if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
 		ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
 		if (ena & IFCAP_TSO) {
 			if (p->if_hwassist & CSUM_INNER_IP_TSO)
 				hwa |= CSUM_IP_TSO;
 			if (p->if_hwassist & CSUM_INNER_IP6_TSO)
 				hwa |= CSUM_IP6_TSO;
 		}
 	}
 
 	ifp->if_capabilities |= cap;
 	ifp->if_capenable |= ena;
 	ifp->if_hwassist |= hwa;
 	/* Only the ifunit_ref() path needs a matching if_rele(). */
 	if (rel)
 		if_rele(p);
 done:
 	CURVNET_RESTORE();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * if_clone create callback: allocate and attach a new vxlan interface.
  *
  * The softc is allocated and given the default configuration, then any
  * user-supplied parameters (ifd->params) are copied in and applied before
  * the ifnet is allocated and attached as an Ethernet interface.  On success
  * the new ifnet is stored in *ifpp and 0 is returned.
  *
  * On failure an errno value is returned and everything allocated so far is
  * released, including the statistics counters once vxlan_stats_alloc() has
  * succeeded.
  */
 static int
 vxlan_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 	struct ifvxlanparam vxlp;
 	int error;
 
 	sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
 	sc->vxl_unit = ifd->unit;
 	sc->vxl_fibnum = curthread->td_proc->p_fibnum;
 	vxlan_set_default_config(sc);
 	/*
 	 * NOTE(review): assumes vxlan_stats_alloc() leaves nothing allocated
 	 * when it fails, so only the softc is freed on this path - confirm.
 	 */
 	error = vxlan_stats_alloc(sc);
 	if (error != 0)
 		goto fail;
 
 	if (ifd->params != NULL) {
 		error = ifc_copyin(ifd, &vxlp, sizeof(vxlp));
 		if (error)
 			goto fail_stats;
 
 		error = vxlan_set_user_config(sc, &vxlp);
 		if (error)
 			goto fail_stats;
 	}
 
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		error = ENOSPC;
 		goto fail_stats;
 	}
 
 	sc->vxl_ifp = ifp;
 	rm_init(&sc->vxl_lock, "vxlanrm");
 	callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0);
 	sc->vxl_port_hash_key = arc4random();
 	vxlan_ftable_init(sc);
 
 	vxlan_sysctl_setup(sc);
 
 	ifp->if_softc = sc;
 	if_initname(ifp, vxlan_name, ifd->unit);
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_init = vxlan_init;
 	ifp->if_ioctl = vxlan_ioctl;
 	ifp->if_transmit = vxlan_transmit;
 	ifp->if_qflush = vxlan_qflush;
 	ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
 	ifp->if_capenable = VXLAN_BASIC_IFCAPS;
 	/* All bits set: no capability is restricted by request yet. */
 	sc->vxl_reqcap = -1;
 	vxlan_set_hwcaps(sc);
 
 	ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
 	ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->vxl_media, IFM_ETHER | IFM_AUTO);
 
 	ether_gen_addr(ifp, &sc->vxl_hwaddr);
 	ether_ifattach(ifp, sc->vxl_hwaddr.octet);
 
 	ifp->if_baudrate = 0;
 
 	VXLAN_WLOCK(sc);
 	vxlan_setup_interface_hdrlen(sc);
 	VXLAN_WUNLOCK(sc);
 	*ifpp = ifp;
 
 	return (0);
 
 fail_stats:
 	/* Counters were allocated; release them before the softc. */
 	vxlan_stats_free(sc);
 fail:
 	free(sc, M_VXLAN);
 	return (error);
 }
 
 /*
  * if_clone destroy callback: tear down a vxlan interface and free its softc.
  *
  * The interface is torn down and its forwarding table flushed before the
  * ifnet is detached and freed; the remaining per-softc resources (media,
  * forwarding table, sysctl tree, lock, statistics) are released afterwards.
  * Always returns 0.
  */
 static int
 vxlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct vxlan_softc *sc;
 
 	sc = ifp->if_softc;
 
 	vxlan_teardown(sc);
 
 	vxlan_ftable_flush(sc, 1);
 
 	ether_ifdetach(ifp);
 	if_free(ifp);
 	/* sc is a separate allocation, so it is still valid after if_free(). */
 	ifmedia_removeall(&sc->vxl_media);
 
 	vxlan_ftable_fini(sc);
 
 	vxlan_sysctl_destroy(sc);
 	rm_destroy(&sc->vxl_lock);
 	vxlan_stats_free(sc);
 	free(sc, M_VXLAN);
 
 	return (0);
 }
 
 /*
  * BMV: Taken from if_bridge.
  *
  * Hash a 6-byte Ethernet address, seeded with the softc's
  * vxl_ftable_hash_key, into a 32-bit value.  The caller reduces the result
  * to a table index.
  *
  * Each octet is widened to uint32_t before shifting: a uint8_t promotes to
  * (signed) int, and shifting a value >= 0x80 left by 24 would shift into the
  * sign bit, which is undefined behaviour.
  */
 static uint32_t
 vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr)
 {
 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key;
 
 	b += (uint32_t)addr[5] << 8;
 	b += (uint32_t)addr[4];
 	a += (uint32_t)addr[3] << 24;
 	a += (uint32_t)addr[2] << 16;
 	a += (uint32_t)addr[1] << 8;
 	a += (uint32_t)addr[0];
 
 /*
  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
  */
 #define	mix(a, b, c)							\
 do {									\
 	a -= b; a -= c; a ^= (c >> 13);					\
 	b -= c; b -= a; b ^= (a << 8);					\
 	c -= a; c -= b; c ^= (b >> 13);					\
 	a -= b; a -= c; a ^= (c >> 12);					\
 	b -= c; b -= a; b ^= (a << 16);					\
 	c -= a; c -= b; c ^= (b >> 5);					\
 	a -= b; a -= c; a ^= (c >> 3);					\
 	b -= c; b -= a; b ^= (a << 10);					\
 	c -= a; c -= b; c ^= (b >> 15);					\
 } while (0)
 
 	mix(a, b, c);
 
 #undef mix
 
 	return (c);
 }
 
 /*
  * ifmedia "change" callback handed to ifmedia_init().  Media change requests
  * are accepted and ignored; always returns 0.
  */
 static int
 vxlan_media_change(struct ifnet *ifp)
 {
 
 	/* Ignore. */
 	return (0);
 }
 
 /*
  * ifmedia "status" callback handed to ifmedia_init().  Unconditionally
  * reports the media as valid and active, full-duplex Ethernet.
  */
 static void
 vxlan_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 
 	ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 }
 
 /*
  * Byte-compare a vxlan address with a generic sockaddr.  The number of bytes
  * compared is the sa_len of vxladdr, not of sa.  Returns the bcmp() result:
  * 0 when those bytes are identical, non-zero otherwise.
  */
 static int
 vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len));
 }
 
 /*
  * Replace *vxladdr with a full copy of sa (AF_INET or AF_INET6 only).  The
  * destination is zeroed first and its length field is forced to the size of
  * the matching sockaddr structure.
  */
 static void
 vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
 	bzero(vxladdr, sizeof(*vxladdr));
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		vxladdr->in4 = *satoconstsin(sa);
 		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
 		break;
 	case AF_INET6:
 		vxladdr->in6 = *satoconstsin6(sa);
 		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
 		break;
 	default:
 		/* Other families leave the zeroed destination untouched. */
 		break;
 	}
 }
 
 /*
  * Compare only the IP address portion of vxladdr and sa, selecting the view
  * of vxladdr by sa's family.  Returns non-zero when the addresses match and
  * 0 when they differ or sa is neither AF_INET nor AF_INET6.
  */
 static int
 vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 	const struct in_addr *addr4;
 	const struct in6_addr *addr6;
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		addr4 = &satoconstsin(sa)->sin_addr;
 		return (addr4->s_addr == vxladdr->in4.sin_addr.s_addr);
 	case AF_INET6:
 		addr6 = &satoconstsin6(sa)->sin6_addr;
 		return (IN6_ARE_ADDR_EQUAL(addr6, &vxladdr->in6.sin6_addr));
 	default:
 		return (0);
 	}
 }
 
 /*
  * Copy only the family, length and IP address from sa into *vxladdr (AF_INET
  * or AF_INET6 only).  Unlike vxlan_sockaddr_copy() the destination is not
  * zeroed, so its other fields, such as the port, keep their values.
  */
 static void
 vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		vxladdr->in4.sin_family = AF_INET;
 		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
 		vxladdr->in4.sin_addr = satoconstsin(sa)->sin_addr;
 		break;
 	case AF_INET6:
 		vxladdr->in6.sin6_family = AF_INET6;
 		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
 		vxladdr->in6.sin6_addr = satoconstsin6(sa)->sin6_addr;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Report whether the address family of vxladdr can be used by this kernel.
  * AF_INET and AF_INET6 are supported only when built with INET and INET6
  * respectively; AF_UNSPEC is accepted only when unspec is non-zero.
  * Returns 1 if supported, 0 otherwise.
  */
 static int
 vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec)
 {
 
 	switch (vxladdr->sa.sa_family) {
 	case AF_UNSPEC:
 		return (unspec != 0 ? 1 : 0);
 	case AF_INET:
 #ifdef INET
 		return (1);
 #else
 		return (0);
 #endif
 	case AF_INET6:
 #ifdef INET6
 		return (1);
 #else
 		return (0);
 #endif
 	default:
 		return (0);
 	}
 }
 
 /*
  * Test whether vxladdr holds the wildcard address of its family.  Returns 1
  * for INADDR_ANY or the unspecified IPv6 address, 0 for any other IPv4/IPv6
  * address, and -1 when the family is neither AF_INET nor AF_INET6.
  */
 static int
 vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr)
 {
 	const struct in6_addr *addr6;
 
 	switch (vxladdr->sa.sa_family) {
 	case AF_INET:
 		return (vxladdr->in4.sin_addr.s_addr == INADDR_ANY);
 	case AF_INET6:
 		addr6 = &vxladdr->in6.sin6_addr;
 		return (IN6_IS_ADDR_UNSPECIFIED(addr6));
 	default:
 		return (-1);
 	}
 }
 
 /*
  * Test whether vxladdr holds a multicast address.  Returns the result of
  * IN_MULTICAST() / IN6_IS_ADDR_MULTICAST() for IPv4 / IPv6 addresses and -1
  * when the family is neither AF_INET nor AF_INET6.
  */
 static int
 vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr)
 {
 	const struct in6_addr *addr6;
 
 	switch (vxladdr->sa.sa_family) {
 	case AF_INET:
 		return (IN_MULTICAST(ntohl(vxladdr->in4.sin_addr.s_addr)));
 	case AF_INET6:
 		addr6 = &vxladdr->in6.sin6_addr;
 		return (IN6_IS_ADDR_MULTICAST(addr6));
 	default:
 		return (-1);
 	}
 }
 
 /*
  * Embed the scope into the IPv6 address held in vxladdr via
  * sa6_embedscope().  Returns its result, or EAFNOSUPPORT when the kernel is
  * built without INET6.  The address must already be an IPv6 one.
  */
 static int
 vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *vxladdr)
 {
 
 	MPASS(VXLAN_SOCKADDR_IS_IPV6(vxladdr));
 #ifdef INET6
 	return (sa6_embedscope(&vxladdr->in6, V_ip6_use_defzone));
 #else
 	return (EAFNOSUPPORT);
 #endif
 }
 
 /*
  * Decide whether the interface configuration may be modified right now.
  * Returns 0 while the interface is running or an init/teardown is in
  * progress, 1 otherwise.  The softc lock must be held.
  */
 static int
 vxlan_can_change_config(struct vxlan_softc *sc)
 {
 
 	VXLAN_LOCK_ASSERT(sc);
 
 	if ((sc->vxl_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 ||
 	    (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN)) != 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Range-check a VNI.  Returns 0 when vni is below VXLAN_VNI_MAX (acceptable)
  * and non-zero when it is out of range.
  */
 static int
 vxlan_check_vni(uint32_t vni)
 {
 
 	return (vni >= VXLAN_VNI_MAX);
 }
 
 /*
  * Range-check a TTL.  Returns 0 when ttl does not exceed MAXTTL and non-zero
  * when it does.  There is no lower-bound check.
  */
 static int
 vxlan_check_ttl(int ttl)
 {
 
 	return (ttl > MAXTTL);
 }
 
 /*
  * Range-check a forwarding table timeout.  Returns 0 when timeout does not
  * exceed VXLAN_FTABLE_MAX_TIMEOUT and non-zero when it does.
  */
 static int
 vxlan_check_ftable_timeout(uint32_t timeout)
 {
 
 	return (timeout > VXLAN_FTABLE_MAX_TIMEOUT);
 }
 
 /*
  * Range-check a forwarding table size limit.  Returns 0 when max does not
  * exceed VXLAN_FTABLE_MAX and non-zero when it does.
  */
 static int
 vxlan_check_ftable_max(uint32_t max)
 {
 
 	return (max > VXLAN_FTABLE_MAX);
 }
 
 /*
  * Create the per-interface sysctl tree under net.link.vxlan, in a node named
  * after the unit number.  It has two read-only subtrees: "ftable"
  * (forwarding table count, max, timeout and a dump handler) and "stats"
  * (forwarding table and hardware offload counters).  Every OID is added to
  * sc->vxl_sysctl_ctx.
  */
 static void
 vxlan_sysctl_setup(struct vxlan_softc *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *node;
 	struct vxlan_statistics *stats;
 	char namebuf[8];
 
 	ctx = &sc->vxl_sysctl_ctx;
 	stats = &sc->vxl_stats;
 	/* The node name is the decimal unit number. */
 	snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit);
 
 	sysctl_ctx_init(ctx);
 	sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 
 	/* net.link.vxlan.<unit>.ftable.* */
 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
 	    OID_AUTO, "ftable", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count",
 	    CTLFLAG_RD, &sc->vxl_ftable_cnt, 0,
 	    "Number of entries in forwarding table");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max",
 	     CTLFLAG_RD, &sc->vxl_ftable_max, 0,
 	    "Maximum number of entries allowed in forwarding table");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout",
 	    CTLFLAG_RD, &sc->vxl_ftable_timeout, 0,
 	    "Number of seconds between prunes of the forwarding table");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
 	    sc, 0, vxlan_ftable_sysctl_dump, "A",
 	    "Dump the forwarding table entries");
 
 	/* net.link.vxlan.<unit>.stats.* */
 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
 	    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0,
 	    "Fowarding table reached maximum entries");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "ftable_lock_upgrade_failed", CTLFLAG_RD,
 	    &stats->ftable_lock_upgrade_failed, 0,
 	    "Forwarding table update required lock upgrade");
 
 	/* Hardware offload counters, exported as 64-bit counter OIDs. */
 	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
 	    CTLFLAG_RD, &stats->txcsum,
 	    "# of times hardware assisted with tx checksum");
 	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
 	    CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
 	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
 	    CTLFLAG_RD, &stats->rxcsum,
 	    "# of times hardware assisted with rx checksum");
 }
 
 /*
  * Release the sysctl context populated by vxlan_sysctl_setup() and clear the
  * cached per-interface node pointer.
  */
 static void
 vxlan_sysctl_destroy(struct vxlan_softc *sc)
 {
 
 	sysctl_ctx_free(&sc->vxl_sysctl_ctx);
 	sc->vxl_sysctl_node = NULL;
 }
 
 /*
  * Fetch the integer tunable "net.link.vxlan.<unit>.<knob>" for this
  * interface.  def is passed to TUNABLE_INT_FETCH() as the in/out value and
  * then returned, so it acts as the default when the fetch leaves it
  * unchanged.
  */
 static int
 vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def)
 {
 	char path[64];
 
 	snprintf(path, sizeof(path), "net.link.vxlan.%d.%s",
 	    sc->vxl_unit, knob);
 	TUNABLE_INT_FETCH(path, &def);
 
 	return (def);
 }
 
 /*
  * ifnet_departure_event handler, registered in vxlan_load().
  *
  * Interfaces that are only being renamed, or that are not multicast capable,
  * are ignored.  For any other departing interface, every vxlan socket is
  * asked (under the global list lock) to collect the softcs affected by the
  * departure onto a local list; each collected softc is then torn down.
  */
 static void
 vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp)
 {
 	struct vxlan_softc_head list;
 	struct vxlan_socket *vso;
 	struct vxlan_softc *sc, *tsc;
 
 	LIST_INIT(&list);
 
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	if ((ifp->if_flags & IFF_MULTICAST) == 0)
 		return;
 
 	VXLAN_LIST_LOCK();
 	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry)
 		vxlan_socket_ifdetach(vso, ifp, &list);
 	VXLAN_LIST_UNLOCK();
 
 	LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) {
 		LIST_REMOVE(sc, vxl_ifdetach_list);
 
 		sx_xlock(&vxlan_sx);
 		VXLAN_WLOCK(sc);
 		/* Let an in-progress init finish before tearing down. */
 		if (sc->vxl_flags & VXLAN_FLAG_INIT)
 			vxlan_init_wait(sc);
 		/*
 		 * NOTE(review): no VXLAN_WUNLOCK() here; presumably
 		 * vxlan_teardown_locked() drops the softc lock - confirm.
 		 */
 		vxlan_teardown_locked(sc);
 		sx_xunlock(&vxlan_sx);
 	}
 }
 
 /*
  * Module load: set up the global socket list and its mutex, hook interface
  * departure events, and attach the vxlan interface cloner.
  */
 static void
 vxlan_load(void)
 {
 	struct if_clone_addreq req = {
 		.create_f = vxlan_clone_create,
 		.destroy_f = vxlan_clone_destroy,
 		.flags = IFC_F_AUTOUNIT,
 	};
 
 	mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF);
 	LIST_INIT(&vxlan_socket_list);
 
 	vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY);
 
 	vxlan_cloner = ifc_attach_cloner(vxlan_name, &req);
 }
 
 /*
  * Module unload: undo vxlan_load().  The departure event handler is removed
  * first, then the cloner is detached and the list mutex destroyed.  The
  * socket list is asserted to be empty at the end.
  */
 static void
 vxlan_unload(void)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    vxlan_ifdetach_event_tag);
 	ifc_detach_cloner(vxlan_cloner);
 	mtx_destroy(&vxlan_list_mtx);
 	MPASS(LIST_EMPTY(&vxlan_socket_list));
 }
 
 /*
  * Module event handler.  MOD_LOAD and MOD_UNLOAD run vxlan_load() and
  * vxlan_unload() and return 0; every other event type returns ENOTSUP.
  */
 static int
 vxlan_modevent(module_t mod, int type, void *unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		vxlan_load();
 		return (0);
 	case MOD_UNLOAD:
 		vxlan_unload();
 		return (0);
 	default:
 		return (ENOTSUP);
 	}
 }
 
 /* Module description: name, event handler and (unused) private data. */
 static moduledata_t vxlan_mod = {
 	"if_vxlan",
 	vxlan_modevent,
 	0
 };
 
 /* Register the module in the pseudo-device subsystem, as version 1. */
 DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vxlan, 1);
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index 3b743caa34e0..5f202b120005 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -1,7350 +1,7351 @@
 /*-
  * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/mp_ring.h>
 #include <net/debugnet.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/led/led.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 
 #include <net/iflib.h>
 #include <net/iflib_private.h>
 
 #include "ifdi_if.h"
 
 #ifdef PCI_IOV
 #include <dev/pci/pci_iov.h>
 #endif
 
 #include <sys/bitstring.h>
 /*
  * enable accounting of every mbuf as it comes in to and goes out of
  * iflib's software descriptor references
  */
 #define MEMORY_LOGGING 0
 /*
  * Enable mbuf vectors for compressing long mbuf chains
  */
 
 /*
  * NB:
  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  *   the cost of a prefetch. This will of course vary based on the workload:
  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  *        is quite expensive, thus suggesting very little prefetch.
  *      - small packet forwarding which is just returning a single mbuf to
  *        UMA will typically be very fast vis a vis the cost of a memory
  *        access.
  */
 
 /*
  * File organization:
  *  - private structures
  *  - iflib private utility functions
  *  - ifnet functions
  *  - vlan registry and other exported functions
  *  - iflib public core functions
  *
  *
  */
 /* malloc(9) type used for iflib allocations. */
 MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
 
 /*
  * Bit flags with values 0x1 and 0x2.
  * NOTE(review): presumably returned by the rx completion path - confirm.
  */
 #define	IFLIB_RXEOF_MORE (1U << 0)
 #define	IFLIB_RXEOF_EMPTY (2U << 0)
 
 /* Forward declarations and pointer typedefs for the queue structures. */
 struct iflib_txq;
 typedef struct iflib_txq *iflib_txq_t;
 struct iflib_rxq;
 typedef struct iflib_rxq *iflib_rxq_t;
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
 struct iflib_ctx;
 
 static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
 static void iflib_timer(void *arg);
 static void iflib_tqg_detach(if_ctx_t ctx);
 
 /*
  * Bundle describing an interrupt filter: the driver's filter routine and
  * its argument, plus a grouptask and a context pointer.
  * NOTE(review): presumably the task is the one scheduled when the filter
  * asks for deferred handling - confirm against the interrupt path.
  */
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
 	void *ifi_ctx;
 } *iflib_filter_info_t;
 
 /*
  * Per-device iflib context.  This is the structure behind the if_ctx_t
  * handle taken by the iflib_get_*() accessors.
  */
 struct iflib_ctx {
 	KOBJ_FIELDS;
 	/*
 	 * Pointer to hardware driver's softc
 	 */
 	void *ifc_softc;
 	device_t ifc_dev;
 	if_t ifc_ifp;
 
 	cpuset_t ifc_cpus;
 	if_shared_ctx_t ifc_sctx;
 	struct if_softc_ctx ifc_softc_ctx;
 
 	/* Context (sx) and state (mutex) locks; see CTX_LOCK/STATE_LOCK. */
 	struct sx ifc_ctx_sx;
 	struct mtx ifc_state_mtx;
 
 	/* Transmit and receive queues. */
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
 	uint32_t ifc_if_flags;
 	uint32_t ifc_flags;
 	uint32_t ifc_max_fl_buf_size;
 	uint32_t ifc_rx_mbuf_sz;
 
 	int ifc_link_state;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
 
 	struct if_irq ifc_legacy_irq;
 	struct grouptask ifc_admin_task;
 	struct grouptask ifc_vflr_task;
 	struct iflib_filter_info ifc_filter_info;
 	struct ifmedia	ifc_media;
 	struct ifmedia	*ifc_mediap;
 
 	/* Values backing the per-device sysctl knobs. */
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
 	uint16_t ifc_sysctl_rx_budget;
 	uint16_t ifc_sysctl_tx_abdicate;
 	uint16_t ifc_sysctl_core_offset;
 #define	CORE_OFFSET_UNSPECIFIED	0xffff
 	uint8_t  ifc_sysctl_separate_txrx;
 	uint8_t  ifc_sysctl_use_logical_cores;
 	bool	 ifc_cpus_are_physical_cores;
 
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
 	/* Driver tx/rx method table; the isc_* macros alias its members. */
 	struct if_txrx ifc_txrx;
 #define isc_txd_encap  ifc_txrx.ift_txd_encap
 #define isc_txd_flush  ifc_txrx.ift_txd_flush
 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
 #define isc_rxd_available ifc_txrx.ift_rxd_available
 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 #define isc_txq_select ifc_txrx.ift_txq_select
 #define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	/* MAC address recorded by iflib_set_mac(). */
 	struct ether_addr ifc_mac;
 };
 
 /* Return the hardware driver's softc pointer stored in the context. */
 void *
 iflib_get_softc(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_softc);
 }
 
 /* Return the device_t stored in the context. */
 device_t
 iflib_get_dev(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_dev);
 }
 
 /* Return the network interface handle stored in the context. */
 if_t
 iflib_get_ifp(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_ifp);
 }
 
 /*
  * Return the context's ifmedia pointer (ifc_mediap).  This is the pointer
  * member, not the address of the embedded ifc_media structure.
  */
 struct ifmedia *
 iflib_get_media(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_mediap);
 }
 
 /*
  * Return the context's ifc_flags word (the flags that carry IFC_IN_DETACH).
  * The value is read without taking the state lock.
  */
 uint32_t
 iflib_get_flags(if_ctx_t ctx)
 {
 	return (ctx->ifc_flags);
 }
 
 /* Copy the ETHER_ADDR_LEN-byte address in mac into the context. */
 void
 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
 {
 
 	bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
 }
 
 /* Return a pointer to the if_softc_ctx embedded in the context. */
 if_softc_ctx_t
 iflib_get_softc_ctx(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_softc_ctx);
 }
 
 /* Return the shared context pointer stored in the context. */
 if_shared_ctx_t
 iflib_get_sctx(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_sctx);
 }
 
 /* True when the mbuf data pointer is 2 bytes past a 4-byte boundary. */
 #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
 /* Number of pointers that fit in one cache line. */
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
 /*
  * Round ptr up to the next cache line boundary; an already aligned pointer
  * is returned unchanged.  The low bits must be cleared with the complement
  * of (CACHE_LINE_SIZE - 1): masking with (CACHE_LINE_SIZE - 1) itself would
  * keep only the offset within the line and yield a bogus pointer.
  * CACHE_LINE_SIZE must be a power of two.
  */
 #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)CACHE_LINE_SIZE-1)))
 
 /* True when the context's recorded link state is LINK_STATE_UP. */
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 /* Non-zero when the shared context carries the IFLIB_IS_VF flag. */
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
 /*
  * Software receive descriptor state, kept as parallel arrays (one pointer
  * per array) rather than as an array of structures.
  */
 typedef struct iflib_sw_rx_desc_array {
 	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
 	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
 	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
 } iflib_rxsd_array_t;
 
 /*
  * Software transmit descriptor state, kept as parallel arrays; TSO packets
  * have their own array of DMA maps.
  */
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 } if_txsd_vec_t;
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
 #define IFLIB_RX_COPY_THRESH		128
 /* sizes the per-free-list refill arrays (ifl_bus_addrs, ifl_rxd_idxs) */
 #define IFLIB_MAX_RX_REFRESH		32
 /* The minimum descriptors per second before we start coalescing */
 #define IFLIB_MIN_DESC_SEC		16384
 #define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
 /* queue status values; NOTE(review): presumably stored in ift_qstatus */
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
 /* maximum number of txqs that can share an rx interrupt */
 #define IFLIB_MAX_TX_SHARED_INTR	4
 
 /* this should really scale with ring size - this is a fairly arbitrary value */
 #define TX_BATCH_SIZE			32
 
 #define IFLIB_RESTART_BUDGET		8
 
 /* union of the checksum and TSO offload request flags */
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 
 /*
  * Per-transmit-queue state.  The structure, and the ift_segs array inside
  * it, are aligned to CACHE_LINE_SIZE.
  */
 struct iflib_txq {
 	/* ring indices; TXQ_AVAIL() derives free space from cidx/pidx/gen */
 	qidx_t		ift_in_use;
 	qidx_t		ift_cidx;
 	qidx_t		ift_cidx_processed;
 	qidx_t		ift_pidx;
 	uint8_t		ift_gen;
 	uint8_t		ift_br_offset;
 	uint16_t	ift_npending;
 	uint16_t	ift_db_pending;
 	uint16_t	ift_rs_pending;
 	/* implicit pad */
 	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
 	uint64_t	ift_cleaned_prev;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
 #endif
 	/* event counters */
 	uint64_t	ift_no_tx_dma_setup;
 	uint64_t	ift_no_desc_avail;
 	uint64_t	ift_mbuf_defrag_failed;
 	uint64_t	ift_mbuf_defrag;
 	uint64_t	ift_map_failed;
 	uint64_t	ift_txd_encap_efbig;
 	uint64_t	ift_pullups;
 	uint64_t	ift_last_timer_tick;
 
 	/* ift_mtx is the mutex taken by CALLOUT_LOCK()/CALLOUT_UNLOCK() */
 	struct mtx	ift_mtx;
 	struct mtx	ift_db_mtx;
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
 	struct ifmp_ring        *ift_br;
 	struct grouptask	ift_task;
 	qidx_t		ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
 #ifdef DEV_NETMAP
 	struct callout	ift_netmap_timer;
 #endif /* DEV_NETMAP */
 
 	if_txsd_vec_t	ift_sds;
 	uint8_t		ift_qstatus;
 	uint8_t		ift_closed;
 	uint8_t		ift_update_freq;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t	ift_buf_tag;
 	bus_dma_tag_t	ift_tso_buf_tag;
 	iflib_dma_info_t	ift_ifdi;
 #define	MTX_NAME_LEN	32
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ift_cpu_exec_count[256];
 #endif
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Per-free-list state belonging to a receive queue (ifl_rxq points back to
  * the owner).  The structure is aligned to CACHE_LINE_SIZE.
  */
 struct iflib_fl {
 	qidx_t		ifl_cidx;
 	qidx_t		ifl_pidx;
 	qidx_t		ifl_credits;
 	uint8_t		ifl_gen;
 	uint8_t		ifl_rxd_size;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
 	uint64_t	ifl_cl_enqueued;
 	uint64_t	ifl_cl_dequeued;
 #endif
 	/* implicit pad */
 	bitstr_t 	*ifl_rx_bitmap;
 	qidx_t		ifl_fragidx;
 	/* constant */
 	qidx_t		ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
 	iflib_rxsd_array_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t	ifl_buf_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	/* refill scratch arrays, IFLIB_MAX_RX_REFRESH entries each */
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	qidx_t		ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
 /*
  * Compute how many entries of a ring of `size` slots are in use, given the
  * consumer index, the producer index and the generation flag.  When the
  * two indices are equal, gen decides between empty (0) and full (1); any
  * other gen value with equal indices panics.
  */
 static inline qidx_t
 get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
 {
 	qidx_t used;
 
 	if (pidx != cidx) {
 		/* Producer ahead of consumer, or wrapped around behind it. */
 		used = (pidx > cidx) ? pidx - cidx : size - cidx + pidx;
 		return (used);
 	}
 
 	switch (gen) {
 	case 0:
 		used = 0;
 		break;
 	case 1:
 		used = size;
 		break;
 	default:
 		panic("bad state");
 	}
 
 	return (used);
 }
 
 /*
  * Free descriptors in a tx queue: ring size minus the in-use count.  The
  * argument is parenthesized so that any pointer expression (for example
  * "base + i") can be passed; it is evaluated more than once.
  */
 #define TXQ_AVAIL(txq) ((txq)->ift_size - get_inuse((txq)->ift_size, (txq)->ift_cidx, (txq)->ift_pidx, (txq)->ift_gen))
 
 /* Distance from tail forward to head in a ring of `wrap` entries. */
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 /*
  * Per-receive-queue state.  The structure, and the ifr_frags array inside
  * it, are aligned to CACHE_LINE_SIZE.
  */
 struct iflib_rxq {
 	if_ctx_t	ifr_ctx;
 	iflib_fl_t	ifr_fl;
 	uint64_t	ifr_rx_irq;
 	struct pfil_head	*pfil;
 	/*
 	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
 	 * the completion queue consumer index.  Otherwise it's unused.
 	 */
 	qidx_t		ifr_cq_cidx;
 	uint16_t	ifr_id;
 	uint8_t		ifr_nfl;
 	/* ifr_txqid holds up to IFLIB_MAX_TX_SHARED_INTR tx queue ids */
 	uint8_t		ifr_ntxqirq;
 	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
 	uint8_t		ifr_fl_offset;
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct callout		ifr_watchdog;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
 
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ifr_cpu_exec_count[256];
 #endif
 }  __aligned(CACHE_LINE_SIZE);
 
 /* Reference to one rx buffer: a cluster pointer slot and its free list. */
 typedef struct if_rxsd {
 	caddr_t *ifsd_cl;
 	iflib_fl_t ifsd_fl;
 } *if_rxsd_t;
 
 /*
  * Sizes of struct if_pkt_info and struct if_rxd_info expressed in machine
  * words (PKT_TYPE), so both can be cleared with word stores by
  * pkt_info_zero() and rxd_info_zero().
  */
 /* multiple of word size */
 #ifdef __LP64__
 #define PKT_INFO_SIZE	6
 #define RXD_INFO_SIZE	5
 #define PKT_TYPE uint64_t
 #else
 #define PKT_INFO_SIZE	11
 #define RXD_INFO_SIZE	8
 #define PKT_TYPE uint32_t
 #endif
 /* Sizes rounded down to a multiple of the unroll factor (3 and 4). */
 #define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
 #define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
 
 /* Word-array overlays with the same size as the real structures. */
 typedef struct if_pkt_info_pad {
 	PKT_TYPE pkt_val[PKT_INFO_SIZE];
 } *if_pkt_info_pad_t;
 typedef struct if_rxd_info_pad {
 	PKT_TYPE rxd_val[RXD_INFO_SIZE];
 } *if_rxd_info_pad_t;
 
 /* Fail the build if the word counts above fall out of date. */
 CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
 CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
 
 /*
  * Clear a struct if_pkt_info by storing zero to each of its PKT_INFO_SIZE
  * machine words through the if_pkt_info_pad overlay.
  */
 static inline void
 pkt_info_zero(if_pkt_info_t pi)
 {
 	if_pkt_info_pad_t words = (if_pkt_info_pad_t)pi;
 	int idx;
 
 	for (idx = 0; idx < PKT_INFO_SIZE; idx++)
 		words->pkt_val[idx] = 0;
 }
 
 /* Device methods of the iflib pseudo driver: attach and detach only. */
 static device_method_t iflib_pseudo_methods[] = {
 	DEVMETHOD(device_attach, noop_attach),
 	DEVMETHOD(device_detach, iflib_pseudo_detach),
 	DEVMETHOD_END
 };
 
 /* Pseudo driver definition; its softc is a struct iflib_ctx. */
 driver_t iflib_pseudodriver = {
 	"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
 };
 
 /*
  * Clear a struct if_rxd_info by storing zero to each of its RXD_INFO_SIZE
  * machine words through the if_rxd_info_pad overlay.
  */
 static inline void
 rxd_info_zero(if_rxd_info_t ri)
 {
 	if_rxd_info_pad_t words = (if_rxd_info_pad_t)ri;
 	int idx;
 
 	for (idx = 0; idx < RXD_INFO_SIZE; idx++)
 		words->rxd_val[idx] = 0;
 }
 
 /*
  * Only allow a single packet to take up at most 1/nth of the tx ring
  */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 #define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
 #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
 #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
 #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
 
 #define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
 #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
 #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
 #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
 
 /*
  * Per-txq mutex helpers.  The argument is parenthesized so that any
  * pointer expression (e.g. "base + i") binds before the "->" operator.
  */
 #define CALLOUT_LOCK(txq)	mtx_lock(&(txq)->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&(txq)->ift_mtx)
 
 /*
  * Mark the context as being detached by setting IFC_IN_DETACH in its
  * flags word while holding the state lock.
  *
  * ctx: iflib context to mark.
  */
 void
 iflib_set_detach(if_ctx_t ctx)
 {
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags = ctx->ifc_flags | IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 }
 
 /* Our boot-time initialization hook */
 static int	iflib_module_event_handler(module_t, int, void *);
 
 /* Module descriptor: name, event handler, and no private argument. */
 static moduledata_t iflib_moduledata = {
 	"iflib",
 	iflib_module_event_handler,
 	NULL
 };
 
 /* Register the module (version 1) at SI_SUB_INIT_IF. */
 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(iflib, 1);
 
 /* iflib depends on the pci and ether modules. */
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
 /* Task queue groups: if_io_tqg is sized by mp_ncpus, if_config_tqg by 1. */
 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 /*
  * Unless the build already chose a value, enable the debug counters only
  * when the kernel is built with INVARIANTS.
  */
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
 #define IFLIB_DEBUG_COUNTERS 1
 #else
 #define IFLIB_DEBUG_COUNTERS 0
 #endif /* !INVARIANTS */
 #endif
 
 /* Root of the net.iflib sysctl tree. */
 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "iflib driver parameters");
 
 /*
  * XXX need to ensure that this can't accidentally cause the head to be
  * moved backwards
  */
 static int iflib_min_tx_latency = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
 		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
 /*
  * NOTE(review): the description string below is identical to the one for
  * min_tx_latency and looks copy-pasted; confirm what no_tx_batch actually
  * controls before rewording it.
  */
 static int iflib_no_tx_batch = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
 		   &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
 /* Read/write tunable, initialized to 1000. */
 static int iflib_timer_default = 1000;
 SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
 		   &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
 
 
 #if IFLIB_DEBUG_COUNTERS
 
 /*
  * Debug counters, exported read-only under net.iflib.  They are plain
  * ints incremented through DBG_COUNTER_INC().
  */
 
 /* Transmit / receive / free-list refill activity. */
 static int iflib_tx_seen;
 static int iflib_tx_sent;
 static int iflib_tx_encap;
 static int iflib_rx_allocs;
 static int iflib_fl_refills;
 static int iflib_fl_refills_large;
 static int iflib_tx_frees;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
 		   &iflib_tx_seen, 0, "# TX mbufs seen");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
 		   &iflib_tx_sent, 0, "# TX mbufs sent");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
 		   &iflib_tx_encap, 0, "# TX mbufs encapped");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
 		   &iflib_tx_frees, 0, "# TX frees");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
 		   &iflib_rx_allocs, 0, "# RX allocations");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
 		   &iflib_fl_refills, 0, "# refills");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
 		   &iflib_fl_refills_large, 0, "# large refills");
 
 /* Transmit queue drain outcomes. */
 static int iflib_txq_drain_flushing;
 static int iflib_txq_drain_oactive;
 static int iflib_txq_drain_notready;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
 		   &iflib_txq_drain_notready, 0, "# drain notready");
 
 /* Transmit encapsulation failures. */
 static int iflib_encap_load_mbuf_fail;
 static int iflib_encap_pad_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
 
 /* Interrupt and receive-path events. */
 static int iflib_task_fn_rxs;
 static int iflib_rx_intr_enables;
 static int iflib_fast_intrs;
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_if_input;
 static int iflib_rxd_flush;
 
 /* Read/write knob (not a counter). */
 static int iflib_verbose_debug;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
 		   &iflib_rx_intr_enables, 0, "# RX intr enables");
 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
 		   &iflib_fast_intrs, 0, "# fast_intr calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
 		   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 /* Atomically add one to the counter named iflib_<name>. */
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
 /*
  * Reset every iflib debug counter to zero.  The iflib_verbose_debug knob
  * is not a counter and is left untouched.
  */
 static void
 iflib_debug_reset(void)
 {
 
 	/* Transmit / receive / refill activity. */
 	iflib_tx_seen = 0;
 	iflib_tx_sent = 0;
 	iflib_tx_encap = 0;
 	iflib_rx_allocs = 0;
 	iflib_fl_refills = 0;
 	iflib_fl_refills_large = 0;
 	iflib_tx_frees = 0;
 
 	/* Transmit queue drain outcomes. */
 	iflib_txq_drain_flushing = 0;
 	iflib_txq_drain_oactive = 0;
 	iflib_txq_drain_notready = 0;
 
 	/* Encapsulation failures. */
 	iflib_encap_load_mbuf_fail = 0;
 	iflib_encap_pad_mbuf_fail = 0;
 	iflib_encap_txq_avail_fail = 0;
 	iflib_encap_txd_encap_fail = 0;
 
 	/* Interrupt and receive-path events. */
 	iflib_task_fn_rxs = 0;
 	iflib_rx_intr_enables = 0;
 	iflib_fast_intrs = 0;
 	iflib_rx_unavail = 0;
 	iflib_rx_ctx_inactive = 0;
 	iflib_rx_if_input = 0;
 	iflib_rxd_flush = 0;
 }
 
 #else
 /* Debug counters compiled out: increments and the reset become no-ops. */
 #define DBG_COUNTER_INC(name)
 static void iflib_debug_reset(void) {}
 #endif
 
 #define IFLIB_DEBUG 0
 
 /* Forward declarations for file-local functions defined later in the file. */
 static void iflib_tx_structures_free(if_ctx_t ctx);
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
 /* ALTQ-only transmit entry points. */
 #ifdef ALTQ
 static void iflib_altq_if_start(if_t ifp);
 static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
 #endif
 static int iflib_register(if_ctx_t);
 static void iflib_deregister(if_ctx_t);
 static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
 static uint16_t iflib_get_mbuf_size_for(unsigned int size);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
 static void iflib_ifmp_purge(iflib_txq_t txq);
 static void _iflib_pre_assert(if_softc_ctx_t scctx);
 static void iflib_if_init_locked(if_ctx_t ctx);
 static void iflib_free_intr_mem(if_ctx_t ctx);
 /* Only declared when __NO_STRICT_ALIGNMENT is not defined. */
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf * iflib_fixup_rx(struct mbuf *m);
 #endif
 
 /* Singly-linked list of cpu_offset records, initially empty. */
 static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
     SLIST_HEAD_INITIALIZER(cpu_offsets);
 /* One list entry: a CPU set plus a reference count and a next-CPU id. */
 struct cpu_offset {
 	SLIST_ENTRY(cpu_offset) entries;	/* list linkage */
 	cpuset_t	set;
 	unsigned int	refcount;
 	uint16_t	next_cpuid;
 };
 /* MTX_DEF mutex set up at boot via MTX_SYSINIT. */
 static struct mtx cpu_offset_mtx;
 MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
     MTX_DEF);
 
 DEBUGNET_DEFINE(iflib);
 
 /*
  * Return the configured number of RX descriptors.  The count is read from
  * isc_nrxd[1] when the shared context sets IFLIB_HAS_RXCQ, and from
  * isc_nrxd[0] otherwise.
  */
 static int
 iflib_num_rx_descs(if_ctx_t ctx)
 {
 	if_shared_ctx_t shared = ctx->ifc_sctx;
 	if_softc_ctx_t softc_ctx = &ctx->ifc_softc_ctx;
 	uint16_t ring_slot;
 
 	if (shared->isc_flags & IFLIB_HAS_RXCQ)
 		ring_slot = 1;
 	else
 		ring_slot = 0;
 
 	return (softc_ctx->isc_nrxd[ring_slot]);
 }
 
 /*
  * Return the configured number of TX descriptors.  The count is read from
  * isc_ntxd[1] when the shared context sets IFLIB_HAS_TXCQ, and from
  * isc_ntxd[0] otherwise.
  */
 static int
 iflib_num_tx_descs(if_ctx_t ctx)
 {
 	if_shared_ctx_t shared = ctx->ifc_sctx;
 	if_softc_ctx_t softc_ctx = &ctx->ifc_softc_ctx;
 	uint16_t ring_slot;
 
 	if (shared->isc_flags & IFLIB_HAS_TXCQ)
 		ring_slot = 1;
 	else
 		ring_slot = 0;
 
 	return (softc_ctx->isc_ntxd[ring_slot]);
 }
 
 #ifdef DEV_NETMAP
 #include <sys/selinfo.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 /* With DEV_NETMAP, iflib additionally depends on the netmap module. */
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
 static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
 static void iflib_netmap_timer(void *arg);
 
 /*
  * device-specific sysctl variables:
  *
  * iflib_crcstrip: 0: keep CRC in rx frames, 1: strip it (the value the
  *	variable is initialized to below).
  *	During regular operations the CRC is stripped, but on some
  *	hardware reception of frames not multiple of 64 is slower,
  *	so using crcstrip=0 helps in benchmarks.
  *
  * iflib_rx_miss, iflib_rx_miss_bufs:
  *	count packets that might be missed due to lost interrupts.
  */
 SYSCTL_DECL(_dev_netmap);
 /*
  * The xl driver by default strips CRCs and we do not override it.
  */
 
 int iflib_crcstrip = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
 
 int iflib_rx_miss, iflib_rx_miss_bufs;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
 
 /*
  * netmap register/unregister callback.  We are already under the netmap
  * lock; only called on the first register or the last unregister.
  *
  * na:    netmap adapter whose ifnet belongs to an iflib context.
  * onoff: non-zero to enter netmap mode, zero to leave it.
  *
  * Returns 0 if the interface is running after re-initialization and 1
  * otherwise (in which case the native netmap flags are cleared again).
  */
 static int
 iflib_netmap_register(struct netmap_adapter *na, int onoff)
 {
 	if_t ifp = na->ifp;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int not_running;
 
 	CTX_LOCK(ctx);
 	if (!CTX_IS_VF(ctx))
 		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	iflib_stop(ctx);
 
 	/*
 	 * With the device stopped, flip the netmap flags and intercept (or
 	 * restore) ifp->if_transmit; doing it now avoids races.  This has
 	 * to happen after netmap_disable_all_rings() and before
 	 * netmap_enable_all_rings() so both observe the updated
 	 * NAF_NETMAP_ON bit.
 	 */
 	if (onoff)
 		nm_set_native_flags(na);
 	else
 		nm_clear_native_flags(na);
 
 	iflib_init_locked(ctx);
 	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
 
 	/* Report failure (1) when the re-init did not leave us running. */
 	not_running = ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0);
 	if (not_running)
 		nm_clear_native_flags(na);
 	CTX_UNLOCK(ctx);
 	return (not_running);
 }
 
 static int
 iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
 {
 	if_t ifp = na->ifp;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 
 	info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	info->num_tx_descs = iflib_num_tx_descs(ctx);
 	info->num_rx_descs = iflib_num_rx_descs(ctx);
 	info->rx_buf_maxsize = fl->ifl_buf_size;
 	nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
 		info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
 		info->num_rx_descs, info->rx_buf_maxsize);
 
 	return 0;
 }
 
 /*
  * Hand netmap buffers to the NIC through free list 0 of an RX queue.
  *
  * rxq:   receive queue whose free list 0 is refilled.
  * kring: netmap kernel ring paired with that queue.
  * init:  true when called at ring (re)initialization, false from rxsync.
  *
  * Returns 0 on success or when there is nothing to refill; if a slot
  * refers to an invalid buffer, returns the result of
  * netmap_ring_reinit().
  */
 static int
 netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
 {
 	struct netmap_adapter *na = kring->na;
 	u_int const lim = kring->nkr_num_slots - 1;
 	struct netmap_ring *ring = kring->ring;
 	bus_dmamap_t *map;
 	struct if_rxd_update iru;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	u_int nic_i_first, nic_i;
 	u_int nm_i;
 	int i, n;
 #if IFLIB_DEBUG_COUNTERS
 	int rf_count = 0;
 #endif
 
 	/*
 	 * This function is used both at initialization and in rxsync.
 	 * At initialization we need to prepare (with isc_rxd_refill())
 	 * all the netmap buffers currently owned by the kernel, in
 	 * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
 	 * (except for kring->nkr_hwofs). These may be less than
 	 * kring->nkr_num_slots if netmap_reset() was called while
 	 * an application using the kring that still owned some
 	 * buffers.
 	 * At rxsync time, both indexes point to the next buffer to be
 	 * refilled.
 	 * In any case we publish (with isc_rxd_flush()) up to
 	 * (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
 	 * pointer to overrun the head/cons pointer, although this is
 	 * not necessary for some NICs (e.g. vmx).
 	 */
 	if (__predict_false(init)) {
 		n = kring->nkr_num_slots - nm_kr_rxspace(kring);
 	} else {
 		/* Slots released by userspace, modulo the ring size. */
 		n = kring->rhead - kring->nr_hwcur;
 		if (n == 0)
 			return (0); /* Nothing to do. */
 		if (n < 0)
 			n += kring->nkr_num_slots;
 	}
 
 	iru_init(&iru, rxq, 0 /* flid */);
 	map = fl->ifl_sds.ifsd_map;
 	nic_i = fl->ifl_pidx;
 	nm_i = netmap_idx_n2k(kring, nic_i);
 	if (__predict_false(init)) {
 		/*
 		 * On init/reset, nic_i must be 0, and we must
 		 * start to refill from hwtail (see netmap_reset()).
 		 */
 		MPASS(nic_i == 0);
 		MPASS(nm_i == kring->nr_hwtail);
 	} else
 		MPASS(nm_i == kring->nr_hwcur);
 	DBG_COUNTER_INC(fl_refills);
 	/* Refill in batches of at most IFLIB_MAX_RX_REFRESH descriptors. */
 	while (n > 0) {
 #if IFLIB_DEBUG_COUNTERS
 		/* The ninth batch of one call counts as a "large" refill. */
 		if (++rf_count == 9)
 			DBG_COUNTER_INC(fl_refills_large);
 #endif
 		nic_i_first = nic_i;
 		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 
 			MPASS(i < IFLIB_MAX_RX_REFRESH);
 
 			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 			        return netmap_ring_reinit(kring);
 
 			/* Stage this descriptor's address and index. */
 			fl->ifl_bus_addrs[i] = paddr +
 			    nm_get_offset(kring, slot);
 			fl->ifl_rxd_idxs[i] = nic_i;
 
 			if (__predict_false(init)) {
 				netmap_load_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			} else if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			}
 			bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
 			    BUS_DMASYNC_PREREAD);
 			slot->flags &= ~NS_BUF_CHANGED;
 
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 
 		/* Pass the staged batch to the driver. */
 		iru.iru_pidx = nic_i_first;
 		iru.iru_count = i;
 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 	}
 	fl->ifl_pidx = nic_i;
 	/*
 	 * At the end of the loop we must have refilled everything
 	 * we could possibly refill.
 	 */
 	MPASS(nm_i == kring->rhead);
 	kring->nr_hwcur = nm_i;
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	/* Publish up to the descriptor just before the new producer index. */
 	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
 	    nm_prev(nic_i, lim));
 	DBG_COUNTER_INC(rxd_flush);
 
 	return (0);
 }
 
 /* Delay, in microseconds, used when arming the per-txq netmap timer. */
 #define NETMAP_TX_TIMER_US	90
 
 /*
  * Reconcile kernel and user view of the transmit ring.
  *
  * All information is in the kring.
  * Userspace wants to send packets up to the one before kring->rhead,
  * kernel knows kring->nr_hwcur is the first unsent packet.
  *
  * Here we push packets out (as many as possible), and possibly
  * reclaim buffers from previously completed transmission.
  *
  * The caller (netmap) guarantees that there is only one instance
  * running at any time. Any interference with other driver
  * methods should be handled by the individual drivers.
  *
  * kring: netmap TX kernel ring; kring->ring_id selects the iflib txq.
  * flags: not read by this function (a local of the same name shadows it
  *        inside the send path).
  *
  * Always returns 0.
  */
 static int
 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	if_t ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap kring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	struct if_pkt_info pi;
 	int tx_pkts = 0, tx_bytes = 0;
 
 	/*
 	 * interrupts on every tx packet are expensive so request
 	 * them every half ring, or where NS_REPORT is set
 	 */
 	u_int report_frequency = kring->nkr_num_slots >> 1;
 	/* device-specific */
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: process new packets to send.
 	 * nm_i is the current index in the netmap kring,
 	 * nic_i is the corresponding index in the NIC ring.
 	 *
 	 * If we have packets to send (nm_i != head)
 	 * iterate over the netmap ring, fetch length and update
 	 * the corresponding slot in the NIC ring. Some drivers also
 	 * need to update the buffer's physical address in the NIC slot
 	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
 	 *
 	 * The netmap_reload_map() calls is especially expensive,
 	 * even when (as in this case) the tag is 0, so do only
 	 * when the buffer has actually changed.
 	 *
 	 * If possible do not set the report/intr bit on all slots,
 	 * but only a few times per ring or when NS_REPORT is set.
 	 *
 	 * Finally, on 10G and faster drivers, it might be useful
 	 * to prefetch the next slot and txr entry.
 	 */
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
 		uint32_t pkt_len = 0, seg_idx = 0;
 		/* NOTE: this local 'flags' shadows the function parameter. */
 		int nic_i_start = -1, flags = 0;
 		pkt_info_zero(&pi);
 		pi.ipi_segs = txq->ift_segs;
 		pi.ipi_qsidx = kring->ring_id;
 		nic_i = netmap_idx_k2n(kring, nm_i);
 
 		__builtin_prefetch(&ring->slot[nm_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 
 		while (nm_i != head) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t offset = nm_get_offset(kring, slot);
 			u_int len = slot->len;
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 
 			/*
 			 * Request a TX interrupt if the slot asks for a
 			 * report, or at NIC index 0 and at the half-ring
 			 * index.
 			 */
 			flags |= (slot->flags & NS_REPORT ||
 				nic_i == 0 || nic_i == report_frequency) ?
 				IPI_TX_INTR : 0;
 
 			/*
 			 * If this is the first packet fragment, save the
 			 * index of the first NIC slot for later.
 			 */
 			if (nic_i_start < 0)
 				nic_i_start = nic_i;
 
 			/* Zero-length slots do not consume a segment entry. */
 			pi.ipi_segs[seg_idx].ds_addr = paddr + offset;
 			pi.ipi_segs[seg_idx].ds_len = len;
 			if (len) {
 				pkt_len += len;
 				seg_idx++;
 			}
 
 			/* Last fragment of a packet: hand it to the driver. */
 			if (!(slot->flags & NS_MOREFRAG)) {
 				pi.ipi_len = pkt_len;
 				pi.ipi_nsegs = seg_idx;
 				pi.ipi_pidx = nic_i_start;
 				pi.ipi_ndescs = 0;
 				pi.ipi_flags = flags;
 
 				/* Prepare the NIC TX ring. */
 				ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 				DBG_COUNTER_INC(tx_encap);
 
 				/* Update transmit counters */
 				tx_bytes += pi.ipi_len;
 				tx_pkts++;
 
 				/* Reinit per-packet info for the next one. */
 				flags = seg_idx = pkt_len = 0;
 				nic_i_start = -1;
 			}
 
 			/* prefetch for next round */
 			__builtin_prefetch(&ring->slot[nm_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 
 			NM_CHECK_ADDR_LEN_OFF(na, len, offset);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[nic_i], addr);
 			}
 			/* make sure changes to the buffer are synced */
 			bus_dmamap_sync(txq->ift_buf_tag,
 			    txq->ift_sds.ifsd_map[nic_i],
 			    BUS_DMASYNC_PREWRITE);
 
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = nm_i;
 
 		/* synchronize the NIC ring */
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 	}
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 *
 	 * If there are unclaimed buffers, attempt to reclaim them.
 	 * If we don't manage to reclaim them all, and TX IRQs are not in use,
 	 * trigger a per-tx-queue timer to try again later.
 	 */
 	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 		if (iflib_tx_credits_update(ctx, txq)) {
 			/* some tx completed, increment avail */
 			nic_i = txq->ift_cidx_processed;
 			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 		}
 	}
 
 	/* Still outstanding and no TX IRQ available: poll again via timer. */
 	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
 		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 			callout_reset_sbt_on(&txq->ift_netmap_timer,
 			    NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
 			    iflib_netmap_timer, txq,
 			    txq->ift_netmap_timer.c_cpu, 0);
 		}
 
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
 
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the receive ring.
  * Same as for the txsync, this routine must be efficient.
  * The caller guarantees a single invocation, but races against
  * the rest of the driver should be handled here.
  *
  * On call, kring->rhead is the first packet that userspace wants
  * to keep, and kring->rcur is the wakeup point.
  * The kernel has previously reported packets up to kring->rtail.
  *
  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
  * of whether or not we received an interrupt.
  *
  * kring: netmap RX kernel ring; kring->ring_id selects the iflib rxq.
  * flags: only NAF_FORCE_READ is examined.
  *
  * Always returns 0 (the result of the final refill is not propagated).
  */
 static int
 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct netmap_ring *ring = kring->ring;
 	if_t ifp = na->ifp;
 	uint32_t nm_i;	/* index into the netmap ring */
 	uint32_t nic_i;	/* index into the NIC ring */
 	u_int n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 	int i = 0, rx_bytes = 0, rx_pkts = 0;
 
 	if_ctx_t ctx = if_getsoftc(ifp);
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	struct if_rxd_info ri;
 	qidx_t *cidxp;
 
 	/*
 	 * netmap only uses free list 0, to avoid out of order consumption
 	 * of receive buffers
 	 */
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: import newly received packets.
 	 *
 	 * nm_i is the index of the next free slot in the netmap ring,
 	 * nic_i is the index of the next received packet in the NIC ring
 	 * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
 	 * differ in case if_init() has been called while
 	 * in netmap mode. For the receive ring we have
 	 *
 	 *	nic_i = fl->ifl_cidx;
 	 *	nm_i = kring->nr_hwtail (previous)
 	 * and
 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 *
 	 * fl->ifl_cidx is set to 0 on a ring reinit
 	 */
 	if (netmap_no_pendintr || force_update) {
 		uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
 		bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
 		/* Bytes taken off the last fragment when CRC stripping is off. */
 		int crclen = iflib_crcstrip ? 0 : 4;
 		int error, avail;
 
 		/*
 		 * For the free list consumer index, we use the same
 		 * logic as in iflib_rxeof().
 		 */
 		if (have_rxcq)
 			cidxp = &rxq->ifr_cq_cidx;
 		else
 			cidxp = &fl->ifl_cidx;
 		avail = ctx->isc_rxd_available(ctx->ifc_softc,
 		    rxq->ifr_id, *cidxp, USHRT_MAX);
 
 		nic_i = fl->ifl_cidx;
 		nm_i = netmap_idx_n2k(kring, nic_i);
 		MPASS(nm_i == kring->nr_hwtail);
 		/* One iteration per received packet, one slot per fragment. */
 		for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
 			rxd_info_zero(&ri);
 			ri.iri_frags = rxq->ifr_frags;
 			ri.iri_qsidx = kring->ring_id;
 			ri.iri_ifp = ctx->ifc_ifp;
 			ri.iri_cidx = *cidxp;
 
 			error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 			for (i = 0; i < ri.iri_nfrags; i++) {
 				if (error) {
 					/* Driver reported an error: publish empty slots. */
 					ring->slot[nm_i].len = 0;
 					ring->slot[nm_i].flags = 0;
 				} else {
 					ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
 					if (i == (ri.iri_nfrags - 1)) {
 						ring->slot[nm_i].len -= crclen;
 						ring->slot[nm_i].flags = 0;
 
 						/* Update receive counters */
 						rx_bytes += ri.iri_len;
 						rx_pkts++;
 					} else
 						ring->slot[nm_i].flags = NS_MOREFRAG;
 				}
 
 				bus_dmamap_sync(fl->ifl_buf_tag,
 				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
 				nm_i = nm_next(nm_i, lim);
 				fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
 			}
 
 			/* Adopt the driver's cidx, wrapped into [0, isc_nrxd[0]). */
 			if (have_rxcq) {
 				*cidxp = ri.iri_cidx;
 				while (*cidxp >= scctx->isc_nrxd[0])
 					*cidxp -= scctx->isc_nrxd[0];
 			}
 
 		}
 		if (n) { /* update the state variables */
 			if (netmap_no_pendintr && !force_update) {
 				/* diagnostics */
 				iflib_rx_miss ++;
 				iflib_rx_miss_bufs += n;
 			}
 			kring->nr_hwtail = nm_i;
 		}
 		kring->nr_kflags &= ~NKR_PENDINTR;
 	}
 	/*
 	 * Second part: skip past packets that userspace has released.
 	 * (kring->nr_hwcur to head excluded),
 	 * and make the buffers available for reception.
 	 * As usual nm_i is the index in the netmap ring,
 	 * nic_i is the index in the NIC ring, and
 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 */
 	netmap_fl_refill(rxq, kring, false);
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	return (0);
 }
 
 /*
  * netmap interrupt-control callback: with the context lock held, enable
  * the device interrupts when onoff is non-zero and disable them otherwise.
  */
 static void
 iflib_netmap_intr(struct netmap_adapter *na, int onoff)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(na->ifp);
 
 	CTX_LOCK(ctx);
 	if (!onoff) {
 		IFDI_INTR_DISABLE(ctx);
 	} else {
 		IFDI_INTR_ENABLE(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * Describe this iflib context to netmap and attach it.
  *
  * A zeroed struct netmap_adapter is filled with the ifnet, capability
  * flags, ring and descriptor counts and the iflib netmap callbacks, and
  * then handed to netmap_attach(), whose result is returned.
  */
 static int
 iflib_netmap_attach(if_ctx_t ctx)
 {
 	struct netmap_adapter na;
 
 	bzero(&na, sizeof(na));
 
 	/* Both queue-set counts must already be non-zero. */
 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 
 	/* Identity and capabilities. */
 	na.ifp = ctx->ifc_ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG | NAF_OFFSETS;
 
 	/* Ring geometry. */
 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	na.num_tx_desc = iflib_num_tx_descs(ctx);
 	na.num_rx_desc = iflib_num_rx_descs(ctx);
 
 	/* Callbacks. */
 	na.nm_register = iflib_netmap_register;
 	na.nm_config = iflib_netmap_config;
 	na.nm_intr = iflib_netmap_intr;
 	na.nm_txsync = iflib_netmap_txsync;
 	na.nm_rxsync = iflib_netmap_rxsync;
 
 	return (netmap_attach(&na));
 }
 
 /*
  * Prepare a TX queue for netmap mode.
  *
  * Resets the netmap TX ring for this queue; if netmap_reset() returns
  * NULL nothing else is done and 0 is returned.  Otherwise the DMA map of
  * each of the first isc_ntxd[0] NIC slots is loaded with the netmap
  * buffer of the corresponding netmap slot, and 1 is returned.
  */
 static int
 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slots;
 	int nic_idx;
 
 	slots = netmap_reset(na, NR_TX, txq->ift_id, 0);
 	if (slots == NULL)
 		return (0);
 
 	for (nic_idx = 0; nic_idx < ctx->ifc_softc_ctx.isc_ntxd[0]; nic_idx++) {
 		/*
 		 * netmap_idx_n2k() translates the NIC index into the
 		 * matching netmap slot index.  NOTE: some drivers (not this
 		 * one) also need to set the physical buffer address in the
 		 * NIC ring.
 		 */
 		int nm_idx = netmap_idx_n2k(na->tx_rings[txq->ift_id], nic_idx);
 
 		netmap_load_map(na, txq->ift_buf_tag,
 		    txq->ift_sds.ifsd_map[nic_idx], NMB(na, &slots[nm_idx]));
 	}
 	return (1);
 }
 
 /*
  * Prepare an RX queue for netmap mode.
  *
  * Resets the netmap RX ring for this queue; if netmap_reset() returns
  * NULL, returns 0.  Otherwise performs the initial free-list refill for
  * the queue's kring and returns 1.
  */
 static int
 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 
 	if (netmap_reset(na, NR_RX, rxq->ifr_id, 0) == NULL)
 		return (0);
 
 	netmap_fl_refill(rxq, na->rx_rings[rxq->ifr_id], true);
 	return (1);
 }
 
 static void
 iflib_netmap_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	/*
 	 * Wake up the netmap application, to give it a chance to
 	 * call txsync and reclaim more completed TX buffers.
 	 */
 	netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
 }
 
 /* Detaching from netmap is delegated directly to netmap_detach(). */
 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 
 #else
 /*
  * Stubs used when DEV_NETMAP is not defined: the init/attach/irq hooks
  * evaluate to 0 and the remaining hooks expand to nothing.
  */
 #define iflib_netmap_txq_init(ctx, txq) (0)
 #define iflib_netmap_rxq_init(ctx, rxq) (0)
 #define iflib_netmap_detach(ifp)
 #define netmap_enable_all_rings(ifp)
 #define netmap_disable_all_rings(ifp)
 
 #define iflib_netmap_attach(ctx) (0)
 #define netmap_rx_irq(ifp, qid, budget) (0)
 #endif
 
 /*
  * Cache prefetch helpers.  On i386/amd64 they issue prefetcht0
  * instructions; on every other architecture they are empty functions.
  */
 #if defined(__i386__) || defined(__amd64__)
 /* Issue one prefetcht0 for the memory at x. */
 static __inline void
 prefetch(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 }
 
 /*
  * Issue a prefetcht0 for x and, when CACHE_LINE_SIZE is below 128, a
  * second one for the address CACHE_LINE_SIZE bytes further on.
  */
 static __inline void
 prefetch2cachelines(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 #if (CACHE_LINE_SIZE < 128)
 	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
 #endif
 }
 #else
 /* No-op on architectures other than i386/amd64. */
 static __inline void
 prefetch(void *x)
 {
 }
 
 /* No-op on architectures other than i386/amd64. */
 static __inline void
 prefetch2cachelines(void *x)
 {
 }
 #endif
 
 /*
  * Initialize an rx-descriptor update request from a free list.
  *
  * iru:  request to fill in.
  * rxq:  receive queue that owns the free list.
  * flid: index of the free list within rxq->ifr_fl.
  *
  * Only the queue-set id, free-list id, buffer size and the address/index
  * array pointers are set here.
  */
 static void
 iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
 {
 	iflib_fl_t freelist = &rxq->ifr_fl[flid];
 
 	/* Identity of the queue set and free list. */
 	iru->iru_qsidx = rxq->ifr_id;
 	iru->iru_flidx = freelist->ifl_id;
 
 	/* Buffer geometry and the staging arrays owned by the free list. */
 	iru->iru_buf_size = freelist->ifl_buf_size;
 	iru->iru_paddrs = freelist->ifl_bus_addrs;
 	iru->iru_idxs = freelist->ifl_rxd_idxs;
 }
 
 /*
  * bus_dmamap_load() callback: on success, store the bus address of the
  * first segment into the bus_addr_t that arg points to.  On error the
  * destination is left unmodified.
  */
 static void
 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 {
 	bus_addr_t *paddrp = (bus_addr_t *)arg;
 
 	if (err == 0)
 		*paddrp = segs[0].ds_addr;
 }
 
 /*
  * Convert a DMA address width in bits into a busdma "lowaddr" limit.
  * A width of 0, or one equal to the bit width of BUS_SPACE_MAXADDR, yields
  * BUS_SPACE_MAXADDR; any other width yields 2^width - 1.
  */
 #define	DMA_WIDTH_TO_BUS_LOWADDR(width)				\
 	(((width) == 0) || (width) == flsll(BUS_SPACE_MAXADDR) ?	\
 	    BUS_SPACE_MAXADDR : (1ULL << (width)) - 1ULL)
 
 /*
  * Allocate, map and load a single-segment DMA region.
  *
  * ctx:      iflib context; supplies the device and the DMA address width.
  * size:     size of the region in bytes.
  * align:    required alignment of the region.
  * dma:      descriptor that receives the tag, map, virtual address, bus
  *           address and size.
  * mapflags: extra flags for bus_dmamap_load() (BUS_DMA_NOWAIT is always
  *           added).
  *
  * Returns 0 on success or an errno value on failure.  On failure every
  * resource acquired so far is released and dma->idi_tag is set to NULL.
  */
 int
 iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
 {
 	int err;
 	device_t dev = ctx->ifc_dev;
 	bus_addr_t lowaddr;
 
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(ctx->ifc_softc_ctx.isc_dma_width);
 
 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
 				align, 0,		/* alignment, bounds */
 				lowaddr,		/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				size,			/* maxsize */
 				1,			/* nsegments */
 				size,			/* maxsegsize */
 				BUS_DMA_ALLOCNOW,	/* flags */
 				NULL,			/* lockfunc */
 				NULL,			/* lockarg */
 				&dma->idi_tag);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dma_tag_create failed: %d\n",
 		    __func__, err);
 		goto fail_0;
 	}
 
 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 		    __func__, (uintmax_t)size, err);
 		goto fail_1;
 	}
 
 	/* The load callback overwrites this sentinel only on success. */
 	dma->idi_paddr = IF_BAD_DMA;
 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 	if (err || dma->idi_paddr == IF_BAD_DMA) {
 		device_printf(dev,
 		    "%s: bus_dmamap_load failed: %d\n",
 		    __func__, err);
 		/*
 		 * The load may report success without the callback having
 		 * produced an address.  Never return 0 from the failure
 		 * path: the caller would use a region that was just freed.
 		 */
 		if (err == 0)
 			err = ENOMEM;
 		goto fail_2;
 	}
 
 	dma->idi_size = size;
 	return (0);
 
 fail_2:
 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 fail_1:
 	bus_dma_tag_destroy(dma->idi_tag);
 fail_0:
 	dma->idi_tag = NULL;
 
 	return (err);
 }
 
 /*
  * Allocate a DMA region using the queue alignment from the shared
  * context (isc_q_align), which must be non-zero.  See
  * iflib_dma_alloc_align() for the meaning of the other arguments and of
  * the return value.
  */
 int
 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 {
 	if_shared_ctx_t shared;
 	int err;
 
 	shared = ctx->ifc_sctx;
 	KASSERT(shared->isc_q_align != 0, ("alignment value not initialized"));
 
 	err = iflib_dma_alloc_align(ctx, size, shared->isc_q_align, dma,
 	    mapflags);
 	return (err);
 }
 
 /*
  * Allocate several DMA regions in one call.
  *
  * ctx:      iflib context.
  * sizes:    array of 'count' region sizes.
  * dmalist:  array of 'count' DMA descriptors to fill in.
  * mapflags: flags passed through to iflib_dma_alloc().
  * count:    number of regions to allocate.
  *
  * Returns 0 on success (including when count is not positive, in which
  * case nothing is allocated).  On the first failure the regions already
  * allocated by this call are freed and the error is returned.
  */
 int
 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 {
 	/* err must start at 0: the loop body never runs when count <= 0. */
 	int i, err = 0;
 	iflib_dma_info_t *dmaiter;
 
 	dmaiter = dmalist;
 	for (i = 0; i < count; i++, dmaiter++) {
 		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
 			break;
 	}
 	/* Undo the first i (successful) allocations. */
 	if (err)
 		iflib_dma_free_multi(dmalist, i);
 	return (err);
 }
 
 /*
  * Release a DMA region described by 'dma'.
  *
  * Nothing is done when the descriptor has no tag.  Otherwise the map is
  * synced and unloaded if a bus address is recorded, the memory is freed
  * if a virtual address is recorded, and finally the tag is destroyed.
  * Each released field is reset so that a second call is harmless.
  */
 void
 iflib_dma_free(iflib_dma_info_t dma)
 {
 
 	if (dma->idi_tag != NULL) {
 		/* Undo bus_dmamap_load(). */
 		if (dma->idi_paddr != IF_BAD_DMA) {
 			bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 			dma->idi_paddr = IF_BAD_DMA;
 		}
 
 		/* Undo bus_dmamem_alloc(). */
 		if (dma->idi_vaddr != NULL) {
 			bus_dmamem_free(dma->idi_tag, dma->idi_vaddr,
 			    dma->idi_map);
 			dma->idi_vaddr = NULL;
 		}
 
 		/* Undo bus_dma_tag_create(). */
 		bus_dma_tag_destroy(dma->idi_tag);
 		dma->idi_tag = NULL;
 	}
 }
 
 /*
  * Free the first 'count' DMA regions referenced by 'dmalist', in order,
  * by calling iflib_dma_free() on each.
  */
 void
 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 {
 	int idx;
 
 	for (idx = 0; idx < count; idx++)
 		iflib_dma_free(dmalist[idx]);
 }
 
 /*
  * Interrupt filter.
  *
  * arg: the iflib_filter_info_t registered with the interrupt.
  *
  * If a driver filter is installed it runs first; unless its result has
  * FILTER_SCHEDULE_THREAD set, that result is returned as-is.  Otherwise
  * the associated group task is enqueued and FILTER_HANDLED is returned.
  */
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *task = info->ifi_task;
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		int rc = info->ifi_filter(info->ifi_filter_arg);
 
 		if (!(rc & FILTER_SCHEDULE_THREAD))
 			return (rc);
 	}
 
 	GROUPTASK_ENQUEUE(task);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Interrupt filter for an RX queue whose vector also covers one or more
  * TX queues (rxq->ifr_txqid[0 .. ifr_ntxqirq - 1]).
  *
  * After the optional driver filter has run, each associated TX queue is
  * polled through isc_txd_credits_update(); queues with work get their TX
  * task enqueued, the others have their interrupt re-enabled.  The RX
  * side is then checked with iflib_rxd_avail() and handled the same way.
  * When IFC_LEGACY is set, per-queue enables are replaced by a single
  * IFDI_INTR_ENABLE() at the end.
  *
  * Returns the driver filter's result if it did not request
  * FILTER_SCHEDULE_THREAD, FILTER_HANDLED otherwise.
  */
 static int
 iflib_fast_intr_rxtx(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	if_ctx_t ctx;
 	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
 	iflib_txq_t txq;
 	void *sc;
 	int i, cidx, result;
 	qidx_t txqid;
 	bool intr_enable, intr_legacy;
 
 	DBG_COUNTER_INC(fast_intrs);
 	/* Give the driver-supplied filter the first say, if there is one. */
 	if (info->ifi_filter != NULL) {
 		result = info->ifi_filter(info->ifi_filter_arg);
 		if ((result & FILTER_SCHEDULE_THREAD) == 0)
 			return (result);
 	}
 
 	ctx = rxq->ifr_ctx;
 	sc = ctx->ifc_softc;
 	/* Set when a legacy (single) interrupt must be re-enabled on exit. */
 	intr_enable = false;
 	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
 	MPASS(rxq->ifr_ntxqirq);
 	/* TX side: poll every TX queue that shares this vector. */
 	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
 		txqid = rxq->ifr_txqid[i];
 		txq = &ctx->ifc_txqs[txqid];
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		/* Zero from the driver: no TX task needed, just re-arm. */
 		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
 			if (intr_legacy)
 				intr_enable = true;
 			else
 				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
 			continue;
 		}
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 	}
 	/* RX side: the consumer index lives in the CQ when one exists. */
 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidx = rxq->ifr_cq_cidx;
 	else
 		cidx = rxq->ifr_fl[0].ifl_cidx;
 	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
 		GROUPTASK_ENQUEUE(gtask);
 	else {
 		if (intr_legacy)
 			intr_enable = true;
 		else
 			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 		DBG_COUNTER_INC(rx_intr_enables);
 	}
 	/* Legacy mode: one enable covers every queue flagged above. */
 	if (intr_enable)
 		IFDI_INTR_ENABLE(ctx);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Interrupt filter variant registered for context-level interrupts.
  * Runs the optional driver filter first; if that filter does not request
  * FILTER_SCHEDULE_THREAD its result is returned as is, otherwise the
  * associated group task is enqueued and FILTER_HANDLED is returned.
  */
 static int
 iflib_fast_intr_ctx(void *arg)
 {
 	iflib_filter_info_t finfo;
 	struct grouptask *task;
 	int verdict;
 
 	finfo = arg;
 	task = finfo->ifi_task;
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (finfo->ifi_filter != NULL) {
 		verdict = finfo->ifi_filter(finfo->ifi_filter_arg);
 		if (!(verdict & FILTER_SCHEDULE_THREAD))
 			return (verdict);
 	}
 
 	GROUPTASK_ENQUEUE(task);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Allocate the IRQ resource `rid' and attach either `filter' or
  * `handler' (never both) to it with argument `arg'.
  *
  * On success irq->ii_res and irq->ii_tag are filled in and, when `name'
  * is non-NULL, the interrupt is described with it.  `name' may be NULL;
  * diagnostics then print "unknown".
  *
  * Returns 0 on success, ENOMEM if the resource could not be allocated,
  * or the error from bus_setup_intr().  If bus_setup_intr() fails,
  * irq->ii_res stays set; this function does not release it
  * (NOTE(review): presumably the caller's IRQ teardown path does --
  * confirm).
  */
 static int
 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		 driver_filter_t filter, driver_intr_t handler, void *arg,
 		 const char *name)
 {
 	struct resource *res;
 	void *tag = NULL;
 	device_t dev = ctx->ifc_dev;
 	int flags, i, rc;
 
 	flags = RF_ACTIVE;
 	if (ctx->ifc_flags & IFC_LEGACY)
 		flags |= RF_SHAREABLE;
 	MPASS(rid < 512);
 	/* bus_alloc_resource_any() takes the rid by pointer; use a copy. */
 	i = rid;
 	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
 	if (res == NULL) {
 		/* Same NULL-name fallback as the setup-failure message below. */
 		device_printf(dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid,
 		    name ? name : "unknown");
 		return (ENOMEM);
 	}
 	irq->ii_res = res;
 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 						filter, handler, arg, &tag);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 					  rid, name ? name : "unknown", rc);
 		return (rc);
 	} else if (name)
 		bus_describe_intr(dev, res, tag, "%s", name);
 
 	irq->ii_tag = tag;
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate DMA resources for TX buffers as well as memory for the TX
  *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
  *  iflib_sw_tx_desc_array structure, storing all the information that
  *  is needed to transmit a packet on the wire.  This is called only
  *  once at attach, setup is done every reset.
  *
  *  The TSO tag and TSO maps are only created when the interface
  *  advertises IFCAP_TSO.  Returns 0 on success.  On any failure the
  *  error is returned after iflib_tx_structures_free(ctx) has been
  *  called.
  *
  **********************************************************************/
 static int
 iflib_txsd_alloc(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	bus_size_t tsomaxsize;
 	bus_addr_t lowaddr;
 	int err, nsegments, ntsosegments;
 	bool tso;
 
 	nsegments = scctx->isc_tx_nsegments;
 	ntsosegments = scctx->isc_tx_tso_segments_max;
 	tsomaxsize = scctx->isc_tx_tso_size_max;
 	/* Leave room for a VLAN header when the interface can carry one. */
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
 		tsomaxsize += sizeof(struct ether_vlan_header);
 	MPASS(scctx->isc_ntxd[0] > 0);
 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 	MPASS(nsegments > 0);
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
 		MPASS(ntsosegments > 0);
 		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
 	}
 
 	/* Lowest address the tags exclude, derived from the DMA width. */
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
 
 	/*
 	 * Set up DMA tags for TX buffers.
 	 */
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       lowaddr,			/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       sctx->isc_tx_maxsize,		/* maxsize */
 			       nsegments,	/* nsegments */
 			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_buf_tag))) {
 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
 		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
 		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
 		goto fail;
 	}
 	/* Everything TSO-related below is skipped when tso is false. */
 	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
 	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       lowaddr,			/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       tsomaxsize,		/* maxsize */
 			       ntsosegments,	/* nsegments */
 			       sctx->isc_tso_maxsegsize,/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_tso_buf_tag))) {
 		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
 		    err);
 		goto fail;
 	}
 
 	/* Allocate memory for the TX mbuf map. */
 	if (!(txq->ift_sds.ifsd_m =
 	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Create the DMA maps for TX buffers.
 	 */
 	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
 	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 		device_printf(dev,
 		    "Unable to allocate TX buffer DMA map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
 	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 		device_printf(dev,
 		    "Unable to allocate TSO TX buffer map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	/* One regular map (and one TSO map, if enabled) per descriptor. */
 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 		err = bus_dmamap_create(txq->ift_buf_tag, 0,
 		    &txq->ift_sds.ifsd_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 		if (!tso)
 			continue;
 		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
 		    &txq->ift_sds.ifsd_tso_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TSO TX DMA map\n");
 			goto fail;
 		}
 	}
 	return (0);
 fail:
 	/* We free all, it handles case where we are in the middle */
 	iflib_tx_structures_free(ctx);
 	return (err);
 }
 
 /*
  * Sync, unload and destroy the regular and TSO DMA maps of TX slot `i',
  * clearing each array entry afterwards.  A map array that was never
  * allocated is skipped.
  */
 static void
 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	bus_dmamap_t *maps;
 
 	/* Regular TX buffer map. */
 	maps = txq->ift_sds.ifsd_map;
 	if (maps != NULL) {
 		bus_dmamap_sync(txq->ift_buf_tag, maps[i],
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_buf_tag, maps[i]);
 		bus_dmamap_destroy(txq->ift_buf_tag, maps[i]);
 		maps[i] = NULL;
 	}
 
 	/* TSO buffer map. */
 	maps = txq->ift_sds.ifsd_tso_map;
 	if (maps != NULL) {
 		bus_dmamap_sync(txq->ift_tso_buf_tag, maps[i],
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_tso_buf_tag, maps[i]);
 		bus_dmamap_destroy(txq->ift_tso_buf_tag, maps[i]);
 		maps[i] = NULL;
 	}
 }
 
 /*
  * Tear down everything a TX queue owns: the per-slot DMA maps, the
  * mp_ring, the queue mutex, the software descriptor arrays, both DMA
  * tags and the ift_ifdi array.  Every pointer is cleared once its
  * resource has been released.
  */
 static void
 iflib_txq_destroy(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 
 	/* Maps must go before the arrays and tags they depend on. */
 	for (int i = 0; i < txq->ift_size; i++)
 		iflib_txsd_destroy(ctx, txq, i);
 
 	if (txq->ift_br != NULL) {
 		ifmp_ring_free(txq->ift_br);
 		txq->ift_br = NULL;
 	}
 
 	mtx_destroy(&txq->ift_mtx);
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		free(txq->ift_sds.ifsd_map, M_IFLIB);
 		txq->ift_sds.ifsd_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_tso_map != NULL) {
 		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
 		txq->ift_sds.ifsd_tso_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_m != NULL) {
 		free(txq->ift_sds.ifsd_m, M_IFLIB);
 		txq->ift_sds.ifsd_m = NULL;
 	}
 	if (txq->ift_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_buf_tag);
 		txq->ift_buf_tag = NULL;
 	}
 	if (txq->ift_tso_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
 		txq->ift_tso_buf_tag = NULL;
 	}
 	if (txq->ift_ifdi != NULL) {
 		free(txq->ift_ifdi, M_IFLIB);
 		/* Cleared like the other members: no dangling pointer. */
 		txq->ift_ifdi = NULL;
 	}
 }
 
 /*
  * Release the mbuf held in TX slot `i', if any: sync and unload the
  * slot's regular and TSO DMA maps (whichever arrays exist), free the
  * mbuf chain and clear the slot.  An empty slot is left untouched.
  */
 static void
 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	struct mbuf *pkt;
 
 	pkt = txq->ift_sds.ifsd_m[i];
 	if (pkt != NULL) {
 		if (txq->ift_sds.ifsd_map != NULL) {
 			bus_dmamap_sync(txq->ift_buf_tag,
 			    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(txq->ift_buf_tag,
 			    txq->ift_sds.ifsd_map[i]);
 		}
 		if (txq->ift_sds.ifsd_tso_map != NULL) {
 			bus_dmamap_sync(txq->ift_tso_buf_tag,
 			    txq->ift_sds.ifsd_tso_map[i],
 			    BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(txq->ift_tso_buf_tag,
 			    txq->ift_sds.ifsd_tso_map[i]);
 		}
 		m_freem(pkt);
 		DBG_COUNTER_INC(tx_frees);
 		txq->ift_sds.ifsd_m[i] = NULL;
 	}
 }
 
 /*
  * Reset a TX queue to its initial state: mark it idle, rewind the ring
  * indices, zero every descriptor ring of the queue, let the driver do
  * its own setup (IFDI_TXQ_SETUP) and finally sync the rings.
  * Always returns 0.
  */
 static int
 iflib_txq_setup(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t ring;
 	int q;
 
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	/* XXX make configurable */
 	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
 
 	/* Rewind producer/consumer state and pick up the ring size. */
 	txq->ift_cidx_processed = 0;
 	txq->ift_npending = 0;
 	txq->ift_cidx = 0;
 	txq->ift_pidx = 0;
 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 
 	/* Wipe the descriptor memory before the driver touches it. */
 	ring = txq->ift_ifdi;
 	for (q = 0; q < sctx->isc_ntxqs; q++, ring++)
 		bzero((void *)ring->idi_vaddr, ring->idi_size);
 
 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
 
 	ring = txq->ift_ifdi;
 	for (q = 0; q < sctx->isc_ntxqs; q++, ring++) {
 		bus_dmamap_sync(ring->idi_tag, ring->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	}
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate DMA resources for RX buffers as well as memory for the RX
  *  mbuf map, direct RX cluster pointer map and RX cluster bus address
  *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
  *  RX cluster map are kept in a iflib_sw_rx_desc_array structure.
  *  Since we use one entry in iflib_sw_rx_desc_array per received
  *  packet, the maximum number of entries we'll need is equal to the
  *  number of hardware receive descriptors that we've allocated.
  *
  *  The work is repeated for each of the rxq's ifr_nfl free lists.
  *  Returns 0 on success.  On any failure the error is returned after
  *  iflib_rx_structures_free(ctx) has been called.
  *
  **********************************************************************/
 static int
 iflib_rxsd_alloc(iflib_rxq_t rxq)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	iflib_fl_t fl;
 	bus_addr_t lowaddr;
 	int			err;
 
 	MPASS(scctx->isc_nrxd[0] > 0);
 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 
 	/* Lowest address the tags exclude, derived from the DMA width. */
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
 
 	fl = rxq->ifr_fl;
 	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
 		/* Set up DMA tag for RX buffers. */
 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 					 1, 0,			/* alignment, bounds */
 					 lowaddr,		/* lowaddr */
 					 BUS_SPACE_MAXADDR,	/* highaddr */
 					 NULL, NULL,		/* filter, filterarg */
 					 sctx->isc_rx_maxsize,	/* maxsize */
 					 sctx->isc_rx_nsegments,	/* nsegments */
 					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
 					 0,			/* flags */
 					 NULL,			/* lockfunc */
 					 NULL,			/* lockarg */
 					 &fl->ifl_buf_tag);
 		if (err) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA tag: %d\n", err);
 			goto fail;
 		}
 
 		/* Allocate memory for the RX mbuf map. */
 		if (!(fl->ifl_sds.ifsd_m =
 		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX mbuf map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Allocate memory for the direct RX cluster pointer map. */
 		if (!(fl->ifl_sds.ifsd_cl =
 		      (caddr_t *) malloc(sizeof(caddr_t) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX cluster map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Allocate memory for the RX cluster bus address map. */
 		if (!(fl->ifl_sds.ifsd_ba =
 		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX bus address map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/*
 		 * Create the DMA maps for RX buffers.
 		 */
 		if (!(fl->ifl_sds.ifsd_map =
 		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX buffer DMA map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 		/*
 		 * NOTE(review): this inner `i' shadows the free-list index of
 		 * the enclosing loop.  The outer index is not used inside the
 		 * inner body, so behavior is unaffected.
 		 */
 		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
 			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
 			    &fl->ifl_sds.ifsd_map[i]);
 			if (err != 0) {
 				device_printf(dev, "Unable to create RX buffer DMA map\n");
 				goto fail;
 			}
 		}
 	}
 	return (0);
 
 fail:
 	/* Frees the RX structures of the whole context, not just this rxq. */
 	iflib_rx_structures_free(ctx);
 	return (err);
 }
 
 /*
  * Internal service routines
  */
 
 /*
  * Result record filled in by _rxq_refill_cb() for the cluster load
  * performed in iflib_fl_refill().
  */
 struct rxq_refill_cb_arg {
 	int               error;	/* error code passed to the callback */
 	bus_dma_segment_t seg;		/* copy of the first DMA segment */
 	int               nseg;		/* segment count passed to the callback */
 };
 
 /*
  * bus_dmamap_load() callback used when loading an RX cluster.
  *
  * @arg:   struct rxq_refill_cb_arg that receives the result
  * @segs:  segment array produced by the load
  * @nseg:  number of entries in @segs
  * @error: load status, 0 on success
  *
  * The error code and segment count are always recorded.  The first
  * segment is copied only when the load succeeded, because the segment
  * array carries no meaningful data on failure.  The caller checks
  * cb_arg->error before it reads cb_arg->seg.
  */
 static void
 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct rxq_refill_cb_arg *cb_arg = arg;
 
 	cb_arg->error = error;
 	cb_arg->nseg = nseg;
 	/* Do not touch segs[] after a failed load. */
 	if (error != 0)
 		return;
 	cb_arg->seg = segs[0];
 }
 
 /**
  * iflib_fl_refill - refill an rxq free-buffer list
  * @ctx: the iflib context
  * @fl: the free list to refill
  * @count: the number of new buffers to allocate
  *
  * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
  * The caller must ensure that @count does not exceed the queue's capacity
  * minus one (since we always leave a descriptor unavailable).
  *
  * Returns 0 when all @count buffers were added, IFLIB_RXEOF_EMPTY when the
  * loop stopped early because a cluster, DMA load or mbuf allocation failed.
  */
 static uint8_t
 iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 {
 	struct if_rxd_update iru;
 	struct rxq_refill_cb_arg cb_arg;
 	struct mbuf *m;
 	caddr_t cl, *sd_cl;
 	struct mbuf **sd_m;
 	bus_dmamap_t *sd_map;
 	bus_addr_t bus_addr, *sd_ba;
 	int err, frag_idx, i, idx, n, pidx;
 	qidx_t credits;
 
 	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
 
 	sd_m = fl->ifl_sds.ifsd_m;
 	sd_map = fl->ifl_sds.ifsd_map;
 	sd_cl = fl->ifl_sds.ifsd_cl;
 	sd_ba = fl->ifl_sds.ifsd_ba;
 	/*
 	 * pidx is the start of the batch currently being built, idx the
 	 * running producer position, frag_idx the buffer-slot search cursor.
 	 */
 	pidx = fl->ifl_pidx;
 	idx = pidx;
 	frag_idx = fl->ifl_fragidx;
 	credits = fl->ifl_credits;
 
 	/* i counts the buffers collected in the current batch. */
 	i = 0;
 	n = count;
 	MPASS(n > 0);
 	MPASS(credits + n <= fl->ifl_size);
 
 	if (pidx < fl->ifl_cidx)
 		MPASS(pidx + n <= fl->ifl_cidx);
 	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
 		MPASS(fl->ifl_gen == 0);
 	if (pidx > fl->ifl_cidx)
 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 
 	DBG_COUNTER_INC(fl_refills);
 	if (n > 8)
 		DBG_COUNTER_INC(fl_refills_large);
 	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
 	while (n-- > 0) {
 		/*
 		 * We allocate an uninitialized mbuf + cluster, mbuf is
 		 * initialized after rx.
 		 *
 		 * If the cluster is still set then we know a minimum sized
 		 * packet was received
 		 */
 		/* Find a clear bit at or after frag_idx, else wrap to 0. */
 		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
 		    &frag_idx);
 		if (frag_idx < 0)
 			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
 		MPASS(frag_idx >= 0);
 		if ((cl = sd_cl[frag_idx]) == NULL) {
 			/* No cluster cached in this slot: allocate and map one. */
 			cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
 			if (__predict_false(cl == NULL))
 				break;
 
 			cb_arg.error = 0;
 			MPASS(sd_map != NULL);
 			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
 			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
 			    BUS_DMA_NOWAIT);
 			if (__predict_false(err != 0 || cb_arg.error)) {
 				uma_zfree(fl->ifl_zone, cl);
 				break;
 			}
 
 			sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
 			sd_cl[frag_idx] = cl;
 #if MEMORY_LOGGING
 			fl->ifl_cl_enqueued++;
 #endif
 		} else {
 			/* Reuse the cluster and bus address already in the slot. */
 			bus_addr = sd_ba[frag_idx];
 		}
 		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
 		    BUS_DMASYNC_PREREAD);
 
 		if (sd_m[frag_idx] == NULL) {
 			/*
 			 * On failure the cluster stays recorded in sd_cl[], so
 			 * a later pass finds it via the branch above.
 			 */
 			m = m_gethdr_raw(M_NOWAIT, 0);
 			if (__predict_false(m == NULL))
 				break;
 			sd_m[frag_idx] = m;
 		}
 		bit_set(fl->ifl_rx_bitmap, frag_idx);
 #if MEMORY_LOGGING
 		fl->ifl_m_enqueued++;
 #endif
 
 		DBG_COUNTER_INC(rx_allocs);
 		fl->ifl_rxd_idxs[i] = frag_idx;
 		fl->ifl_bus_addrs[i] = bus_addr;
 		credits++;
 		i++;
 		MPASS(credits <= fl->ifl_size);
 		if (++idx == fl->ifl_size) {
 #ifdef INVARIANTS
 			fl->ifl_gen = 1;
 #endif
 			idx = 0;
 		}
 		/* Hand a batch to the driver when it is full or we are done. */
 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
 			iru.iru_pidx = pidx;
 			iru.iru_count = i;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			fl->ifl_pidx = idx;
 			fl->ifl_credits = credits;
 			pidx = idx;
 			i = 0;
 		}
 	}
 
 	/*
 	 * n == count - 1 here means the very first iteration bailed out, so
 	 * nothing was added and there is nothing to publish or flush.
 	 */
 	if (n < count - 1) {
 		/* Publish a partial batch left over from an early break. */
 		if (i != 0) {
 			iru.iru_pidx = pidx;
 			iru.iru_count = i;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			fl->ifl_pidx = idx;
 			fl->ifl_credits = credits;
 		}
 		DBG_COUNTER_INC(rxd_flush);
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
 		    fl->ifl_id, fl->ifl_pidx);
 		/*
 		 * Advance the saved search cursor past frag_idx only if that
 		 * slot ended up in use; otherwise retry it next time.
 		 */
 		if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
 			fl->ifl_fragidx = frag_idx + 1;
 			if (fl->ifl_fragidx == fl->ifl_size)
 				fl->ifl_fragidx = 0;
 		} else {
 			fl->ifl_fragidx = frag_idx;
 		}
 	}
 
 	/* n reaches -1 only when the loop ran to completion without a break. */
 	return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
 }
 
 /*
  * Refill a free list as far as possible while keeping one descriptor
  * back.  Returns what iflib_fl_refill() returns, or 0 when there is no
  * room to refill.
  */
 static inline uint8_t
 iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
 {
 	int32_t room;
 #ifdef INVARIANTS
 	int32_t delta;
 #endif
 
 	/*
 	 * One descriptor is always held back so that pidx never catches up
 	 * with cidx; most NICs cannot cope with that.  On Intel NICs, for
 	 * example, each receive ring has RDH (next descriptor the NIC will
 	 * use) and RDT (next descriptor the driver will publish, so RDT - 1
 	 * is the last valid one).  RDH == RDT means "nothing available to
 	 * the NIC"; letting it also mean "everything available" would be
 	 * ambiguous.
 	 */
 	room = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
 #endif
 
 	MPASS(fl->ifl_credits <= fl->ifl_size);
 	MPASS(room == delta);
 
 	if (room <= 0)
 		return (0);
 	return (iflib_fl_refill(ctx, fl, room));
 }
 
 /*
  * Report whether IFC_IN_DETACH is set in the context flags.  The flag is
  * sampled under the state lock.  Returns 1 if set, 0 otherwise.
  */
 uint8_t
 iflib_in_detach(if_ctx_t ctx)
 {
 	uint8_t detaching;
 
 	STATE_LOCK(ctx);
 	detaching = ((ctx->ifc_flags & IFC_IN_DETACH) != 0) ? 1 : 0;
 	STATE_UNLOCK(ctx);
 
 	return (detaching);
 }
 
 /*
  * Release every cluster and mbuf currently held by a free list, then
  * reset the list's indices and zero its descriptor ring memory.
  */
 static void
 iflib_fl_bufs_free(iflib_fl_t fl)
 {
 	iflib_dma_info_t ring = fl->ifl_ifdi;
 	struct mbuf **mbufs = fl->ifl_sds.ifsd_m;
 	caddr_t *clusters = fl->ifl_sds.ifsd_cl;
 	uint32_t slot;
 
 	for (slot = 0; slot < fl->ifl_size; slot++) {
 		if (clusters[slot] == NULL) {
 			/* A slot without a cluster must not hold an mbuf. */
 			MPASS(mbufs[slot] == NULL);
 		} else {
 			bus_dmamap_sync(fl->ifl_buf_tag,
 			    fl->ifl_sds.ifsd_map[slot], BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(fl->ifl_buf_tag,
 			    fl->ifl_sds.ifsd_map[slot]);
 			uma_zfree(fl->ifl_zone, clusters[slot]);
 			clusters[slot] = NULL;
 			if (mbufs[slot] != NULL) {
 				/* Initialize the raw mbuf before freeing it. */
 				m_init(mbufs[slot], M_NOWAIT, MT_DATA, 0);
 				m_free_raw(mbufs[slot]);
 				mbufs[slot] = NULL;
 			}
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_dequeued++;
 		fl->ifl_cl_dequeued++;
 #endif
 	}
 #ifdef INVARIANTS
 	for (slot = 0; slot < fl->ifl_size; slot++) {
 		MPASS(fl->ifl_sds.ifsd_cl[slot] == NULL);
 		MPASS(fl->ifl_sds.ifsd_m[slot] == NULL);
 	}
 #endif
 	/* Back to an empty list. */
 	fl->ifl_fragidx = 0;
 	fl->ifl_gen = 0;
 	fl->ifl_pidx = 0;
 	fl->ifl_cidx = 0;
 	fl->ifl_credits = 0;
 	bzero(ring->idi_vaddr, ring->idi_size);
 }
 
 /*********************************************************************
  *
  *  Prepare a free list for use: drop the buffers it currently holds,
  *  choose the buffer size and backing zone, and prime it with an
  *  initial batch of buffers.  Returns 0 on success, ENOBUFS when the
  *  initial fill came up short.
  *
  **********************************************************************/
 static int
 iflib_fl_setup(iflib_fl_t fl)
 {
 	iflib_rxq_t rxq = fl->ifl_rxq;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int prefill, qidx;
 
 	/* Start from a clean slate: no slot in use, no buffers held. */
 	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
 	iflib_fl_bufs_free(fl);
 	MPASS(fl->ifl_credits == 0);
 
 	/* A driver-supplied per-queue buffer size wins over the default. */
 	qidx = rxq->ifr_fl_offset + fl->ifl_id;
 	if (scctx->isc_rxd_buf_size[qidx] == 0)
 		fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
 	else
 		fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
 	/*
 	 * The size may have come from the driver, so round it up to one of
 	 * the supported mbuf cluster sizes.
 	 */
 	fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
 	if (ctx->ifc_max_fl_buf_size < fl->ifl_buf_size)
 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
 	/*
 	 * Cap the initial fill at 128 buffers rather than loading the whole
 	 * ring, which keeps attach cheap for idle cards.  One descriptor is
 	 * always left unavailable; see iflib_fl_refill_all().
 	 */
 	MPASS(fl->ifl_size > 0);
 	prefill = min(128, fl->ifl_size - 1);
 	(void)iflib_fl_refill(ctx, fl, prefill);
 	/* Anything short of the requested fill counts as failure. */
 	if (fl->ifl_credits != prefill)
 		return (ENOBUFS);
 
 	MPASS(rxq != NULL);
 	MPASS(fl->ifl_ifdi != NULL);
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free the software state of a receive queue: per-free-list DMA maps,
  *  DMA tag and bookkeeping arrays, followed by the free-list and
  *  ifr_ifdi arrays themselves.  Does nothing when the queue has no
  *  free lists.
  *
  **********************************************************************/
 static void
 iflib_rx_sds_free(iflib_rxq_t rxq)
 {
 	iflib_fl_t fl;
 	int flidx, slot;
 
 	if (rxq->ifr_fl == NULL)
 		return;
 
 	for (flidx = 0; flidx < rxq->ifr_nfl; flidx++) {
 		fl = &rxq->ifr_fl[flidx];
 
 		/* Maps can only exist if the tag does; tear both down. */
 		if (fl->ifl_buf_tag != NULL) {
 			if (fl->ifl_sds.ifsd_map != NULL) {
 				for (slot = 0; slot < fl->ifl_size; slot++) {
 					bus_dmamap_sync(fl->ifl_buf_tag,
 					    fl->ifl_sds.ifsd_map[slot],
 					    BUS_DMASYNC_POSTREAD);
 					bus_dmamap_unload(fl->ifl_buf_tag,
 					    fl->ifl_sds.ifsd_map[slot]);
 					bus_dmamap_destroy(fl->ifl_buf_tag,
 					    fl->ifl_sds.ifsd_map[slot]);
 				}
 			}
 			bus_dma_tag_destroy(fl->ifl_buf_tag);
 			fl->ifl_buf_tag = NULL;
 		}
 
 		/* Bookkeeping arrays of this free list. */
 		free(fl->ifl_sds.ifsd_m, M_IFLIB);
 		fl->ifl_sds.ifsd_m = NULL;
 		free(fl->ifl_sds.ifsd_cl, M_IFLIB);
 		fl->ifl_sds.ifsd_cl = NULL;
 		free(fl->ifl_sds.ifsd_ba, M_IFLIB);
 		fl->ifl_sds.ifsd_ba = NULL;
 		free(fl->ifl_sds.ifsd_map, M_IFLIB);
 		fl->ifl_sds.ifsd_map = NULL;
 		free(fl->ifl_rx_bitmap, M_IFLIB);
 		fl->ifl_rx_bitmap = NULL;
 	}
 
 	free(rxq->ifr_fl, M_IFLIB);
 	rxq->ifr_fl = NULL;
 	free(rxq->ifr_ifdi, M_IFLIB);
 	rxq->ifr_ifdi = NULL;
 	rxq->ifr_cq_cidx = 0;
 }
 
 /*
  * Timer routine
  *
  * Per-TX-queue callout handler; `arg' is the iflib_txq_t.  While the
  * interface is running it periodically calls the driver's IFDI_TIMER,
  * tracks whether the queue is making progress, kicks the TX task if
  * doorbells are pending, and re-arms itself.  A queue found hung takes
  * the `hung' path, which requests a reset and does not re-arm.
  */
 static void
 iflib_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
 	/* Note: despite the name, this is the softc context (ifc_softc_ctx). */
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	uint64_t this_tick = ticks;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 
 	/*
 	** Check on the state of the TX queue(s), this
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
 	if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
 		txq->ift_last_timer_tick = this_tick;
 		IFDI_TIMER(ctx, txq->ift_id);
 		/*
 		 * Already marked hung on an earlier pass: give up if nothing
 		 * was cleaned since then or no pause frames were seen.
 		 */
 		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
 		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
 		     (sctx->isc_pause_frames == 0)))
 			goto hung;
 
 		/* A stalled, non-idle ring is marked hung for the next pass. */
 		if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
 		    ifmp_ring_is_stalled(txq->ift_br)) {
 			KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
 			    ("queue can't be marked as hung if interface is down"));
 			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
 		}
 		txq->ift_cleaned_prev = txq->ift_cleaned;
 	}
 	/* handle any laggards */
 	if (txq->ift_db_pending)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 
 	sctx->isc_pause_frames = 0;
 	/* Re-arm on the CPU the callout is already bound to. */
 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
 		    txq, txq->ift_timer.c_cpu);
 	return;
 
  hung:
 	device_printf(ctx->ifc_dev,
 	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
 	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 	STATE_LOCK(ctx);
 	/* Set OACTIVE, clear RUNNING, and ask the admin task to reset. */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 }
 
 /*
  * Map a requested buffer size onto one of the two cluster sizes used
  * here: MCLBYTES when the request fits, MJUMPAGESIZE for anything
  * larger.
  */
 static uint16_t
 iflib_get_mbuf_size_for(unsigned int size)
 {
 
 	if (size > MCLBYTES)
 		return (MJUMPAGESIZE);
 	return (MCLBYTES);
 }
 
 /*
  * Derive the RX mbuf size for this context from the driver's maximum
  * frame size and store it in ifc_rx_mbuf_sz.
  */
 static void
 iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
 {
 
 	/*
 	 * XXX don't set the max_frame_size to larger
 	 * than the hardware can handle
 	 */
 	ctx->ifc_rx_mbuf_sz = iflib_get_mbuf_size_for(
 	    ctx->ifc_softc_ctx.isc_max_frame_size);
 }
 
 /*
  * Accessor for the RX mbuf size stored in the context (ifc_rx_mbuf_sz).
  */
 uint32_t
 iflib_get_rx_mbuf_sz(if_ctx_t ctx)
 {
 	uint32_t mbuf_sz;
 
 	mbuf_sz = ctx->ifc_rx_mbuf_sz;
 	return (mbuf_sz);
 }
 
 /*
  * (Re)initialize the interface.  NOTE(review): the name suggests the
  * caller holds the context lock -- confirm against callers.
  *
  * Sequence: set OACTIVE / clear RUNNING and disable interrupts, quiesce
  * netmap rings, recompute the hardware-assist bits from the enabled
  * capabilities, stop the per-TX-queue timers, compute the RX mbuf size,
  * call the driver's IFDI_INIT, set up every free list of every RX queue
  * that is not in netmap mode, then set RUNNING / clear OACTIVE,
  * re-enable interrupts, restart the TX timers and re-enable netmap
  * rings.
  *
  * A free-list setup failure is logged and the remaining free lists are
  * skipped, but the interface is still brought up.
  */
 static void
 iflib_init_locked(if_ctx_t ctx)
 {
 	/* sctx and scctx both alias the softc context (ifc_softc_ctx). */
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	IFDI_INTR_DISABLE(ctx);
 
 	/*
 	 * See iflib_stop(). Useful in case iflib_init_locked() is
 	 * called without first calling iflib_stop().
 	 */
 	netmap_disable_all_rings(ifp);
 
 	/* Split the driver's TX checksum abilities into IPv4 and IPv6 sets. */
 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO4)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	/* Stop the per-queue timers; they are restarted at the end. */
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_stop(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		CALLOUT_UNLOCK(txq);
 		(void)iflib_netmap_txq_init(ctx, txq);
 	}
 
 	/*
 	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
 	 * that drivers can use the value when setting up the hardware receive
 	 * buffers.
 	 */
 	iflib_calc_rx_mbuf_sz(ctx);
 
 	/* With INVARIANTS, check that IFDI_INIT leaves the drv flags alone. */
 #ifdef INVARIANTS
 	i = if_getdrvflags(ifp);
 #endif
 	IFDI_INIT(ctx);
 	MPASS(if_getdrvflags(ifp) == i);
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 		if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
 			/* This rxq is in netmap mode. Skip normal init. */
 			continue;
 		}
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			if (iflib_fl_setup(fl)) {
 				device_printf(ctx->ifc_dev,
 				    "setting up free list %d failed - "
 				    "check cluster settings\n", j);
 				goto done;
 			}
 		}
 	}
 done:
 	/* Reached on success and on free-list failure alike. */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	IFDI_INTR_ENABLE(ctx);
 	txq = ctx->ifc_txqs;
 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 			txq->ift_timer.c_cpu);
 
         /* Re-enable txsync/rxsync. */
 	netmap_enable_all_rings(ifp);
 }
 
 /*
  * Media-change callback: forward the request to the driver under the
  * context lock and, if the driver accepted it, reinitialize the
  * interface.  Returns the driver's result.
  */
 static int
 iflib_media_change(if_t ifp)
 {
 	if_ctx_t ctx;
 	int rc;
 
 	ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	rc = IFDI_MEDIA_CHANGE(ctx);
 	if (rc == 0)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (rc);
 }
 
 /*
  * Media-status callback: under the context lock, have the driver refresh
  * its admin status and then fill in `ifmr'.
  */
 static void
 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	IFDI_MEDIA_STATUS(ctx, ifmr);
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * Stop the interface: set OACTIVE / clear RUNNING, disable interrupts
  * and stop the hardware through the driver, quiesce netmap, then return
  * every TX and RX queue to an empty state (timers stopped, buffers
  * freed, counters and indices cleared, descriptor rings zeroed).
  */
 void
 iflib_stop(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t di;
 	iflib_fl_t fl;
 	int i, j;
 
 	/* Tell the stack that the interface is no longer active */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
 	DELAY(1000);
 	IFDI_STOP(ctx);
 	DELAY(1000);
 
 	/*
 	 * Stop any pending txsync/rxsync and prevent new ones
 	 * from starting. Processes blocked in poll() will get
 	 * POLLERR.
 	 */
 	netmap_disable_all_rings(ctx->ifc_ifp);
 
 	iflib_debug_reset();
 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_stop(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		CALLOUT_UNLOCK(txq);
 
 		/* clean any enqueued buffers */
 		iflib_ifmp_purge(txq);
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
 		}
 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
 		txq->ift_in_use = txq->ift_gen = txq->ift_no_desc_avail = 0;
 		/*
 		 * With IFLIB_PRESERVE_TX_INDICES the producer index is kept
 		 * and the consumer index is moved up to it; otherwise both
 		 * restart at 0.
 		 */
 		if (sctx->isc_flags & IFLIB_PRESERVE_TX_INDICES)
 			txq->ift_cidx = txq->ift_pidx;
 		else
 			txq->ift_cidx = txq->ift_pidx = 0;
 
 		/* Reset the per-queue statistics counters. */
 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 		txq->ift_pullups = 0;
 		ifmp_ring_reset_stats(txq->ift_br);
 		/* Zero every descriptor ring belonging to this TX queue. */
 		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 		/* Let any RX task that is queued or running finish first. */
 		gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
 		    &rxq->ifr_task.gt_task);
 
 		rxq->ifr_cq_cidx = 0;
 		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 		/* also resets the free lists pidx/cidx */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			iflib_fl_bufs_free(fl);
 	}
 }
 
 /*
  * Compute the address to prefetch for the RX descriptor after `cidx':
  * CACHE_PTR_NEXT() of the current descriptor's address, wrapped to the
  * ring start when that would run past the end of the ring.  When the
  * descriptor size is unknown (0) the ring start is returned.
  */
 static inline caddr_t
 calc_next_rxd(iflib_fl_t fl, int cidx)
 {
 	qidx_t desc_sz;
 	int ndesc;
 	caddr_t base, limit, here, ahead;
 
 	base = fl->ifl_ifdi->idi_vaddr;
 	desc_sz = fl->ifl_rxd_size;
 	ndesc = fl->ifl_size;
 
 	if (__predict_false(desc_sz == 0))
 		return (base);
 
 	here = base + desc_sz*cidx;
 	limit = base + desc_sz*ndesc;
 	ahead = CACHE_PTR_NEXT(here);
 	if (ahead < limit)
 		return (ahead);
 	return (base);
 }
 
 /*
  * Issue prefetches for upcoming RX work relative to `cidx': the mbuf
  * and cluster array entries CACHE_PTR_INCREMENT slots ahead, the next
  * hardware descriptor, and the mbufs and clusters of the following four
  * slots.  Indices wrap by masking with (ring size - 1).
  */
 static inline void
 prefetch_pkts(iflib_fl_t fl, int cidx)
 {
 	int mask = fl->ifl_size - 1;
 	int ahead, k;
 
 	/* Array entries a cache-pointer stride ahead. */
 	ahead = (cidx + CACHE_PTR_INCREMENT) & mask;
 	prefetch(&fl->ifl_sds.ifsd_m[ahead]);
 	prefetch(&fl->ifl_sds.ifsd_cl[ahead]);
 
 	/* Next hardware descriptor. */
 	prefetch(calc_next_rxd(fl, cidx));
 
 	/* The mbufs, then the clusters, of the next four slots. */
 	for (k = 1; k <= 4; k++)
 		prefetch(fl->ifl_sds.ifsd_m[(cidx + k) & mask]);
 	for (k = 1; k <= 4; k++)
 		prefetch(fl->ifl_sds.ifsd_cl[(cidx + k) & mask]);
 }
 
 /*
  * Consume one received fragment (irf) from its free list and return the
  * mbuf that was paired with the receive buffer.
  *
  * sd is filled in with the owning free list and a pointer to the cluster
  * slot.  The buffer's DMA map is synced for reading, the pfil hooks are run
  * on the raw payload when the queue has hooks, pf_rv is non-NULL and the
  * fragment is not empty, and the free list consumer index is advanced.
  *
  * pf_rv, when non-NULL, receives the pfil verdict (PFIL_PASS if no hook was
  * run).  The DMA map is unloaded only when 'unload' is still set after
  * filtering and the fragment length is non-zero.
  *
  * Returns the fragment's mbuf, the mbuf obtained from pfil_mem2mbuf() for
  * PFIL_REALLOCED, or NULL when the filter dropped or consumed the packet.
  */
 static struct mbuf *
 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
     int *pf_rv, if_rxd_info_t ri)
 {
 	bus_dmamap_t map;
 	iflib_fl_t fl;
 	caddr_t payload;
 	struct mbuf *m;
 	int flid, cidx, len, next;
 
 	map = NULL;
 	flid = irf->irf_flid;
 	cidx = irf->irf_idx;
 	fl = &rxq->ifr_fl[flid];
 	sd->ifsd_fl = fl;
 	m = fl->ifl_sds.ifsd_m[cidx];
 	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
 	/* one fewer buffer is posted on this free list */
 	fl->ifl_credits--;
 #if MEMORY_LOGGING
 	fl->ifl_m_dequeued++;
 #endif
 	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
 		prefetch_pkts(fl, cidx);
 	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
 	prefetch(&fl->ifl_sds.ifsd_map[next]);
 	map = fl->ifl_sds.ifsd_map[cidx];
 
 	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
 
 	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
 	    irf->irf_len != 0) {
 		/* hand the filter the packet data that follows the pad bytes */
 		payload  = *sd->ifsd_cl;
 		payload +=  ri->iri_pad;
 		len = ri->iri_len - ri->iri_pad;
 		*pf_rv = pfil_run_hooks(rxq->pfil, payload, ri->iri_ifp,
 		    len | PFIL_MEMPTR | PFIL_IN, NULL);
 		switch (*pf_rv) {
 		case PFIL_DROPPED:
 		case PFIL_CONSUMED:
 			/*
 			 * The filter ate it.  Everything is recycled.
 			 */
 			m = NULL;
 			unload = 0;
 			break;
 		case PFIL_REALLOCED:
 			/*
 			 * The filter copied it.  Everything is recycled.
 			 */
 			m = pfil_mem2mbuf(payload);
 			unload = 0;
 			break;
 		case PFIL_PASS:
 			/*
 			 * Filter said it was OK, so receive like
 			 * normal
 			 */
 			fl->ifl_sds.ifsd_m[cidx] = NULL;
 			break;
 		default:
 			MPASS(0);
 		}
 	} else {
 		/* no filtering: the mbuf leaves the free list unconditionally */
 		fl->ifl_sds.ifsd_m[cidx] = NULL;
 		if (pf_rv != NULL)
 			*pf_rv = PFIL_PASS;
 	}
 
 	if (unload && irf->irf_len != 0)
 		bus_dmamap_unload(fl->ifl_buf_tag, map);
 	/* advance the consumer index; ifl_size-1 is used as a wrap mask */
 	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
 	if (__predict_false(fl->ifl_cidx == 0))
 		fl->ifl_gen = 0;
 	bit_clear(fl->ifl_rx_bitmap, cidx);
 	return (m);
 }
 
 /*
  * Build an mbuf chain out of the fragments listed in ri.
  *
  * Every fragment is taken off its free list with rxd_frag_to_sd(), which
  * also reports the pfil verdict through *pf_rv while filtering is still
  * enabled for this packet.  Zero-length fragments and fragments of a packet
  * the filter consumed or dropped are kept out of the chain.  For each
  * fragment that is kept, the cluster is detached from its free list slot
  * and attached to the mbuf; iri_pad bytes are skipped on the first kept
  * fragment and subtracted from ri->iri_len.
  *
  * Returns the head of the chain, or NULL if no fragment was kept.
  */
 static struct mbuf *
 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
 {
 	struct mbuf *m, *mh, *mt;
 	caddr_t cl;
 	int  *pf_rv_ptr, flags, i, padlen;
 	bool consumed;
 
 	i = 0;
 	mh = NULL;
 	consumed = false;
 	*pf_rv = PFIL_PASS;
 	pf_rv_ptr = pf_rv;
 	do {
 		/* once 'consumed' is set, later fragments are neither unloaded nor filtered */
 		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
 		    pf_rv_ptr, ri);
 
 		MPASS(*sd->ifsd_cl != NULL);
 
 		/*
 		 * Exclude zero-length frags & frags from
 		 * packets the filter has consumed or dropped
 		 */
 		if (ri->iri_frags[i].irf_len == 0 || consumed ||
 		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) {
 			if (mh == NULL) {
 				/* everything saved here */
 				consumed = true;
 				pf_rv_ptr = NULL;
 				continue;
 			}
 			/* XXX we can save the cluster here, but not the mbuf */
 			m_init(m, M_NOWAIT, MT_DATA, 0);
 			m_free(m);
 			continue;
 		}
 		if (mh == NULL) {
 			/* first kept fragment becomes the packet header mbuf */
 			flags = M_PKTHDR|M_EXT;
 			mh = mt = m;
 			padlen = ri->iri_pad;
 		} else {
 			flags = M_EXT;
 			mt->m_next = m;
 			mt = m;
 			/* assuming padding is only on the first fragment */
 			padlen = 0;
 		}
 		/* take the cluster out of the free list slot */
 		cl = *sd->ifsd_cl;
 		*sd->ifsd_cl = NULL;
 
 		/* Can these two be made one ? */
 		m_init(m, M_NOWAIT, MT_DATA, flags);
 		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
 		/*
 		 * These must follow m_init and m_cljset
 		 */
 		m->m_data += padlen;
 		ri->iri_len -= padlen;
 		m->m_len = ri->iri_frags[i].irf_len;
 	} while (++i < ri->iri_nfrags);
 
 	return (mh);
 }
 
 /*
  * Turn one received packet, described by ri, into an mbuf (chain).
  *
  * A single non-empty fragment no larger than MIN(IFLIB_RX_COPY_THRESH,
  * MHLEN) is copied into the mbuf's own storage so the receive buffer is
  * not unloaded; everything else goes through assemble_segments().
  * When the pfil verdict is neither PFIL_PASS nor PFIL_REALLOCED the
  * result is returned without filling in the packet header fields.
  */
 static struct mbuf *
 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
 	struct if_rxsd sd;
 	struct mbuf *m;
 	int pf_rv;
 	bool copy_small;
 
 	copy_small = ri->iri_nfrags == 1 &&
 	    ri->iri_frags[0].irf_len != 0 &&
 	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN);
 
 	if (copy_small) {
 		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
 		    &pf_rv, ri);
 	} else {
 		m = assemble_segments(rxq, ri, &sd, &pf_rv);
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/* the filter kept or discarded the packet: nothing more to set up */
 	if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
 		return (m);
 
 	if (copy_small && pf_rv == PFIL_PASS) {
 		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
 #ifndef __NO_STRICT_ALIGNMENT
 		if (!IP_ALIGNED(m) && ri->iri_pad == 0)
 			m->m_data += 2;
 #endif
 		memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
 		m->m_len = ri->iri_frags[0].irf_len;
 		m->m_data += ri->iri_pad;
 		ri->iri_len -= ri->iri_pad;
 	}
 
 	/* copy the driver-provided metadata into the packet header */
 	m->m_pkthdr.len = ri->iri_len;
 	m->m_pkthdr.rcvif = ri->iri_ifp;
 	m->m_flags |= ri->iri_flags;
 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
 	m->m_pkthdr.flowid = ri->iri_flowid;
 	M_HASHTYPE_SET(m, ri->iri_rsstype);
 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 	m->m_pkthdr.csum_data = ri->iri_csum_data;
 	return (m);
 }
 
 #if defined(INET6) || defined(INET)
 /*
  * Read the IPv4 and IPv6 forwarding settings of the vnet that owns the LRO
  * control block's interface.  Only the outputs for address families that
  * are compiled in are written.
  */
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
 {
 	CURVNET_SET(lc->ifp->if_vnet); /* XXX - DRVAPI */
 #if defined(INET)
 	*v4 = V_ipforwarding;
 #endif
 #if defined(INET6)
 	*v6 = V_ip6_forwarding;
 #endif
 	CURVNET_RESTORE();
 }
 
 /*
  * Decide from the Ethernet type whether a packet is an LRO candidate.
  *
  * An IPv4/IPv6 frame qualifies only while forwarding is disabled for that
  * family; any other frame type never qualifies.  A false result guarantees
  * that tcp_lro_rx() would not return zero for this packet.
  */
 static bool
 iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
 {
 	struct ether_header *hdr;
 
 	hdr = mtod(m, struct ether_header *);
 #if defined(INET6)
 	if (hdr->ether_type == htons(ETHERTYPE_IPV6))
 		return (!v6_forwarding);
 #endif
 #if defined (INET)
 	if (hdr->ether_type == htons(ETHERTYPE_IP))
 		return (!v4_forwarding);
 #endif
 
 	return (false);
 }
 #else
 /*
  * Stub used when neither INET nor INET6 is compiled in; it leaves *v4 and
  * *v6 untouched.
  */
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
 {
 }
 #endif
 
 /*
  * Watchdog callback: 'context' is the receive queue whose group task is to
  * be enqueued.
  */
 static void
 _task_fn_rx_watchdog(void *context)
 {
 	iflib_rxq_t watched_rxq;
 
 	watched_rxq = context;
 	GROUPTASK_ENQUEUE(&watched_rxq->ifr_task);
 }
 
 /*
  * Receive up to 'budget' packets from rxq and pass them up the stack.
  *
  * Packets are first pulled from the driver into a local m_nextpkt list,
  * the free lists are refilled, and the list is then handed to LRO and/or
  * if_input().
  *
  * The return value is the OR of the iflib_fl_refill_all() results, plus
  * IFLIB_RXEOF_MORE when descriptors are still available.  If the driver's
  * isc_rxd_pkt_get() fails, a reset is requested through the admin task and
  * 0 is returned.
  */
 static uint8_t
 iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
 {
 	if_t ifp;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int avail, i;
 	qidx_t *cidxp;
 	struct if_rxd_info ri;
 	int err, budget_left, rx_bytes, rx_pkts;
 	iflib_fl_t fl;
 	int lro_enabled;
 	bool v4_forwarding, v6_forwarding, lro_possible;
 	uint8_t retval = 0;
 
 	/*
 	 * XXX early demux data packets so that if_input processing only handles
 	 * acks in interrupt context
 	 */
 	struct mbuf *m, *mh, *mt, *mf;
 
 	NET_EPOCH_ASSERT();
 
 	lro_possible = v4_forwarding = v6_forwarding = false;
 	ifp = ctx->ifc_ifp;
 	mh = mt = NULL;
 	MPASS(budget > 0);
 	rx_pkts	= rx_bytes = 0;
 	/* the consumer index lives on the completion queue when there is one */
 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidxp = &rxq->ifr_cq_cidx;
 	else
 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 			retval |= iflib_fl_refill_all(ctx, fl);
 		DBG_COUNTER_INC(rx_unavail);
 		return (retval);
 	}
 
 	/* pfil needs the vnet to be set */
 	CURVNET_SET_QUIET(ifp->if_vnet); /* XXX - DRVAPI */
 	for (budget_left = budget; budget_left > 0 && avail > 0;) {
 		if (__predict_false(!CTX_ACTIVE(ctx))) {
 			DBG_COUNTER_INC(rx_ctx_inactive);
 			break;
 		}
 		/*
 		 * Reset client set fields to their default values
 		 */
 		rxd_info_zero(&ri);
 		ri.iri_qsidx = rxq->ifr_id;
 		ri.iri_cidx = *cidxp;
 		ri.iri_ifp = ifp;
 		ri.iri_frags = rxq->ifr_frags;
 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 
 		if (err)
 			goto err;
 		rx_pkts += 1;
 		rx_bytes += ri.iri_len;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			*cidxp = ri.iri_cidx;
 			/* Update our consumer index */
 			/* XXX NB: shurd - check if this is still safe */
 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 			/* was this only a completion queue message? */
 			if (__predict_false(ri.iri_nfrags == 0))
 				continue;
 		}
 		MPASS(ri.iri_nfrags != 0);
 		MPASS(ri.iri_len != 0);
 
 		/* will advance the cidx on the corresponding free lists */
 		m = iflib_rxd_pkt_get(rxq, &ri);
 		avail--;
 		budget_left--;
 		if (avail == 0 && budget_left)
 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 
 		if (__predict_false(m == NULL))
 			continue;
 
 		/* imm_pkt: -- cxgb */
 		if (mh == NULL)
 			mh = mt = m;
 		else {
 			mt->m_nextpkt = m;
 			mt = m;
 		}
 	}
 	CURVNET_RESTORE();
 	/* make sure that we can refill faster than drain */
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 		retval |= iflib_fl_refill_all(ctx, fl);
 
 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
 	if (lro_enabled)
 		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
 	/*
 	 * Walk the collected packets.  Until a packet proves LRO is possible
 	 * they are re-chained on mf/mt and delivered with one if_input() call;
 	 * afterwards each packet is offered to LRO or delivered on its own.
 	 */
 	mt = mf = NULL;
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 #ifndef __NO_STRICT_ALIGNMENT
 		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
 			continue;
 #endif
 #if defined(INET6) || defined(INET)
 		if (lro_enabled) {
 			if (!lro_possible) {
 				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
 				if (lro_possible && mf != NULL) {
 					/* flush what was batched so ordering is kept */
 					if_input(ifp, mf);
 					DBG_COUNTER_INC(rx_if_input);
 					mt = mf = NULL;
 				}
 			}
 			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
 			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
 				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 					continue;
 			}
 		}
 #endif
 		if (lro_possible) {
 			if_input(ifp, m);
 			DBG_COUNTER_INC(rx_if_input);
 			continue;
 		}
 
 		if (mf == NULL)
 			mf = m;
 		if (mt != NULL)
 			mt->m_nextpkt = m;
 		mt = m;
 	}
 	if (mf != NULL) {
 		if_input(ifp, mf);
 		DBG_COUNTER_INC(rx_if_input);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	/*
 	 * Flush any outstanding LRO work
 	 */
 #if defined(INET6) || defined(INET)
 	tcp_lro_flush_all(&rxq->ifr_lc);
 #endif
 	if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
 		retval |= IFLIB_RXEOF_MORE;
 	return (retval);
 err:
 	/*
 	 * This label is only reached from inside the receive loop, i.e. with
 	 * the vnet still set, so it must be restored here.  Packets gathered
 	 * before the failure were already removed from the free lists and
 	 * would otherwise be lost, so release them.
 	 */
 	CURVNET_RESTORE();
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	}
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_DO_RESET;
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 	return (0);
 }
 
 #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
 /*
  * Maximum number of descriptors whose doorbell write may be deferred,
  * scaled by how full the ring is (in_use): nothing up to one eighth of the
  * ring, then 1/8, 1/2 and finally all of TXD_NOTIFY_COUNT() as usage passes
  * one eighth, one quarter and one half of the ring.
  */
 static inline qidx_t
 txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
 {
 	qidx_t full_count = TXD_NOTIFY_COUNT(txq);
 	qidx_t eighth = txq->ift_size / 8;
 
 	if (in_use <= eighth)
 		return (0);
 	if (in_use <= 2*eighth)
 		return (full_count >> 3);
 	if (in_use <= 4*eighth)
 		return (full_count >> 1);
 	return (full_count);
 }
 
 /*
  * Maximum number of descriptors that may go by before report-status is
  * requested again, scaled by the queue's ift_in_use: 2 up to one eighth of
  * the ring, then 1/4, 1/2 and finally all of TXD_NOTIFY_COUNT() as usage
  * passes one eighth, one quarter and one half of the ring.
  */
 static inline qidx_t
 txq_max_rs_deferred(iflib_txq_t txq)
 {
 	qidx_t full_count = TXD_NOTIFY_COUNT(txq);
 	qidx_t eighth = txq->ift_size / 8;
 
 	if (txq->ift_in_use <= eighth)
 		return (2);
 	if (txq->ift_in_use <= 2*eighth)
 		return (full_count >> 2);
 	if (txq->ift_in_use <= 4*eighth)
 		return (full_count >> 1);
 	return (full_count);
 }
 
 /*
  * mbuf and transmit queue helper macros.  Every macro argument is
  * parenthesized so that passing an expression expands as intended.
  */
 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 #define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)
 
 #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
 #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
 #define TXQ_MAX_DB_CONSUMED(size) ((size) >> 4)
 
 /* forward compatibility for cxgb */
 #define FIRST_QSET(ctx) 0
 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
 /* map an mbuf's flowid onto a transmit queue set index */
 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & (ctx)->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 /* processed-but-not-yet-cleaned descriptors, less isc_tx_nsegments */
 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 
 /* XXX we should be setting this to something other than zero */
 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 #define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
     (ctx)->ifc_softc_ctx.isc_tx_nsegments)
 
 /*
  * Write the transmit doorbell if it is due.
  *
  * The doorbell is written when 'ring' forces it, when the number of
  * pending descriptors has reached the deferral limit, or when the ring is
  * nearly out of free descriptors.  Returns true if the doorbell was
  * written, false otherwise.
  */
 static inline bool
 iflib_txd_db_check(iflib_txq_t txq, int ring)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	qidx_t defer_limit, db_value;
 
 	defer_limit = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
 
 	/* not forced, below the threshold and not at the edge of the ring */
 	if (!ring && txq->ift_db_pending < defer_limit &&
 	    TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
 		return (false);
 
 	/*
 	 * 'npending' is used if the card's doorbell is in terms of the
 	 * number of descriptors pending flush (BRCM).  'pidx' is used in
 	 * cases where the card's doorbell uses the producer index
 	 * explicitly (INTC).
 	 */
 	if (txq->ift_npending)
 		db_value = txq->ift_npending;
 	else
 		db_value = txq->ift_pidx;
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, db_value);
 
 	/*
 	 * Absent bugs nothing is pending any more, so clear both counts.
 	 */
 	txq->ift_npending = 0;
 	txq->ift_db_pending = 0;
 	return (true);
 }
 
 #ifdef PKT_DEBUG
 /*
  * Debug aid: dump the fields of a transmit packet info structure to the
  * console, three lines per packet.
  */
 static void
 print_pkt(if_pkt_info_t pi)
 {
 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 	    pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs,
 	    pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 	    pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz,
 	    pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	    pi->ipi_etype, pi->ipi_ehdrlen,
 	    pi->ipi_ip_hlen, pi->ipi_ipproto);
 }
 #endif
 
 /*
  * Tests on a packet info's checksum flags: whether TSO, or TCP checksum
  * offload or TSO, was requested for IPv4 (4) or IPv6 (6).
  */
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
 
 /**
  * Extract the Ethernet header details of the packet in *mp.
  *
  * On success pi->ipi_etype holds the EtherType in host byte order and
  * pi->ipi_ehdrlen the Ethernet header length, including one VLAN
  * encapsulation header if present.  *pullups is incremented when the header
  * had to be pulled up into the first mbuf, and *mp is updated to the
  * resulting chain.  Returns 0, or ENOMEM if m_pullup() failed.
  *
  * XXX: This doesn't handle QinQ, which could prevent TX offloads for those
  * types of packets.
  */
 static int
 iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
 {
 	struct ether_vlan_header *evh;
 	struct mbuf *pkt;
 
 	pkt = *mp;
 	if (__predict_false(pkt->m_len < sizeof(*evh))) {
 		(*pullups)++;
 		pkt = m_pullup(pkt, sizeof(*evh));
 		if (__predict_false(pkt == NULL))
 			return (ENOMEM);
 	}
 	*mp = pkt;
 
 	evh = mtod(pkt, struct ether_vlan_header *);
 	if (evh->evl_encap_proto != htons(ETHERTYPE_VLAN)) {
 		/* untagged frame */
 		pi->ipi_etype = ntohs(evh->evl_encap_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 	} else {
 		/* tagged frame: the real type follows the VLAN header */
 		pi->ipi_etype = ntohs(evh->evl_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	}
 
 	return (0);
 }
 
 /**
  * Parse up to the L3 header and extract IPv4/IPv6 header information into pi.
  * Currently this information includes: IP ToS value, IP header version/presence
  *
  * This is missing some checks and doesn't edit the packet content as it goes,
  * unlike iflib_parse_header(), in order to keep the amount of code here minimal.
  *
  * *pullups is reset to zero and then counts the m_pullup() calls made here
  * and in iflib_parse_ether_header().  *mp is replaced when the chain had to
  * be duplicated or pulled up.  Returns 0 on success, ENOMEM when m_dup() or
  * m_pullup() fails.
  *
  * NOTE(review): on a failed m_pullup() *mp is not updated here; confirm
  * that callers do not touch *mp after an ENOMEM return.
  */
 static int
 iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
 {
 	struct mbuf *m;
 	int err;
 
 	*pullups = 0;
 	m = *mp;
 	/* work on a private copy if the chain is not writable */
 	if (!M_WRITABLE(m)) {
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			DBG_COUNTER_INC(tx_frees);
 			*mp = m;
 		}
 	}
 
 	/* Fills out pi->ipi_etype */
 	err = iflib_parse_ether_header(pi, mp, pullups);
 	if (err)
 		return (err);
 	m = *mp;
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		int miniplen;
 
 		/* bytes needed in the first mbuf, capped at the packet length */
 		miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip));
 		if (__predict_false(m->m_len < miniplen)) {
 			/*
 			 * Check for common case where the first mbuf only contains
 			 * the Ethernet header
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				n = m->m_next;
 				MPASS(n);
 				/* If next mbuf contains at least the minimal IP header, then stop */
 				if (n->m_len >= sizeof(*ip)) {
 					ip = (struct ip *)n->m_data;
 				} else {
 					(*pullups)++;
 					if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				(*pullups)++;
 				if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 		}
 
 		/* Have the IPv4 header w/ no options here */
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_ip_tos = ip->ip_tos;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6;
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			(*pullups)++;
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		/* taken after the pullup, so it refers to the current first mbuf */
 		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 
 		/* Have the IPv6 fixed header here */
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		break;
 	}
 #endif
 	default:
 		/* not IP: no checksum offload and no IP header */
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 
 }
 
 /*
  * Parse the Ethernet, IP and (for TCP offloads) TCP headers of the packet
  * in *mp and record what the driver needs in pi.
  *
  * The chain is duplicated first when the driver needs scratch space and
  * the mbuf is not writable.  Headers that are not contiguous are pulled up
  * (counted in txq->ift_pullups); every header pointer is re-derived from
  * the current first mbuf after a pullup because m_pullup() may hand back a
  * different mbuf.  For TSO the TCP pseudo-header checksum is written into
  * the packet and, with IFLIB_TSO_INIT_IP, the IPv4 length and checksum are
  * primed as well.
  *
  * *mp is updated to the resulting chain.  Returns 0 on success, ENOMEM if
  * m_dup() or m_pullup() fails, ENXIO if TSO is requested for a non-TCP
  * packet.
  *
  * NOTE(review): on a failed m_pullup() *mp is not updated here; confirm
  * that callers do not touch *mp after an ENOMEM return.
  */
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 {
 	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct mbuf *m;
 	int err;
 
 	m = *mp;
 	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
 	    M_WRITABLE(m) == 0) {
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			DBG_COUNTER_INC(tx_frees);
 			*mp = m;
 		}
 	}
 
 	/* Fills out pi->ipi_etype */
 	err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups);
 	if (__predict_false(err))
 		return (err);
 	m = *mp;
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
 		int minthlen;
 
 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
 			 * move it to a separate function and mark it noinline
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				/* first mbuf holds only the Ethernet header */
 				n = m->m_next;
 				MPASS(n);
 				if (n->m_len >= sizeof(*ip))  {
 					ip = (struct ip *)n->m_data;
 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 				} else {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				/* m_len also covers the Ethernet header */
 				if (m->m_len >= pi->ipi_ehdrlen + (ip->ip_hl << 2) + sizeof(*th))
 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			/* m_len also covers the Ethernet header */
 			if (m->m_len >= pi->ipi_ehdrlen + (ip->ip_hl << 2) + sizeof(*th))
 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_ip_tos = ip->ip_tos;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD4(pi)) {
 			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
 				if (__predict_false(th == NULL)) {
 					/*
 					 * Make Ethernet + IP + TCP headers
 					 * contiguous in the first mbuf, then
 					 * re-derive both pointers from it: the
 					 * old 'ip' may refer to an mbuf that
 					 * m_pullup() trimmed or released.
 					 */
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen +
 					    pi->ipi_ip_hlen + sizeof(*th))) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
 				}
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO4(pi)) {
 				if (__predict_false(ip->ip_p != IPPROTO_TCP))
 					return (ENXIO);
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
 					ip->ip_sum = 0;
 					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
 				}
 			}
 		}
 		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
 			ip->ip_sum = 0;
 
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6;
 		struct tcphdr *th = NULL;
 
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			txq->ift_pullups++;
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		/* derive the header pointer only after the pullup */
 		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD6(pi)) {
 			if (pi->ipi_ipproto == IPPROTO_TCP) {
 				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
 						return (ENOMEM);
 					/* the first mbuf may have changed */
 					ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 				}
 				th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO6(pi)) {
 				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
 					return (ENXIO);
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= CSUM_IP6_TCP;
 				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 			}
 		}
 		break;
 	}
 #endif
 	default:
 		/* not IP: no checksum offload and no IP header */
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 }
 
 /*
  * Take back the mbuf chain recorded for the current producer slot.
  *
  * Used when the hardware rejected the scatter/gather list: the chain has to
  * leave ifsd_m[] before a defragmented replacement can be stored.  The
  * slot's DMA map (and its TSO map, if TSO maps exist) is unloaded and the
  * slot is cleared.  Returns the chain that was stored there.
  */
 static __noinline struct mbuf *
 iflib_remove_mbuf(iflib_txq_t txq)
 {
 	struct mbuf **slots, *taken;
 	int ring_size, slot;
 
 	ring_size = txq->ift_size;
 	slot = txq->ift_pidx & (ring_size - 1);
 	slots = txq->ift_sds.ifsd_m;
 
 	taken = slots[slot];
 	slots[slot] = NULL;
 
 	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[slot]);
 	if (txq->ift_sds.ifsd_tso_map != NULL)
 		bus_dmamap_unload(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[slot]);
 #if MEMORY_LOGGING
 	txq->ift_dequeued++;
 #endif
 	return (taken);
 }
 
 /*
  * Pick the transmit-descriptor address worth prefetching after index cidx
  * of descriptor ring 'qid' of the queue.
  *
  * The candidate is CACHE_PTR_NEXT() of the descriptor at cidx; if that lies
  * at or beyond the end of the ring memory the ring start is used instead.
  * A descriptor size of zero short-circuits to the ring start.
  */
 static inline caddr_t
 calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
 {
 	caddr_t ring_base, ring_end, here, candidate;
 	qidx_t desc_size;
 	int ndesc;
 
 	ring_base = txq->ift_ifdi[qid].idi_vaddr;
 	desc_size = txq->ift_txd_size[qid];
 	ndesc = txq->ift_size;
 
 	if (__predict_false(desc_size == 0))
 		return (ring_base);
 
 	here = ring_base + desc_size*cidx;
 	ring_end = ring_base + desc_size*ndesc;
 	candidate = CACHE_PTR_NEXT(here);
 	if (candidate < ring_end)
 		return (candidate);
 	return (ring_base);
 }
 
 /*
  * Pad an mbuf to ensure a minimum ethernet frame size.
  * min_frame_size is the frame size (less CRC) to pad the mbuf to
  *
  * A chain that is not writable is replaced by a duplicate first, so
  * *m_head may change.  Returns 0 on success.  On failure the chain has
  * been freed and ENOMEM (duplication failed) or ENOBUFS (padding could not
  * be appended) is returned.
  *
  * NOTE(review): *m_head still holds the freed chain's address after a
  * failure; confirm that callers never use it in that case.
  */
 static __noinline int
 iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
 {
 	/*
 	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
 	 * an ARP message is the smallest common payload I can think of
 	 */
 	static char pad[18];	/* just zeros */
 	int n;
 	struct mbuf *new_head;
 
 	if (!M_WRITABLE(*m_head)) {
 		new_head = m_dup(*m_head, M_NOWAIT);
 		if (new_head == NULL) {
 			m_freem(*m_head);
 			device_printf(dev, "cannot pad short frame, m_dup() failed\n");
 			DBG_COUNTER_INC(encap_pad_mbuf_fail);
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOMEM);
 		}
 		m_freem(*m_head);
 		*m_head = new_head;
 	}
 
 	/* append zeros, at most sizeof(pad) at a time, until long enough */
 	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
 	     n > 0; n -= sizeof(pad))
 		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
 			break;
 
 	/* n is still positive only if an m_append() call failed */
 	if (n > 0) {
 		m_freem(*m_head);
 		device_printf(dev, "cannot pad short frame\n");
 		DBG_COUNTER_INC(encap_pad_mbuf_fail);
 		DBG_COUNTER_INC(tx_frees);
 		return (ENOBUFS);
 	}
 
 	return (0);
 }
 
 /*
  * Map the packet in *m_headp for DMA and hand it to the driver's
  * isc_txd_encap() at the queue's current producer index.
  *
  * Short frames are padded first when the driver asks for it, and the
  * headers are parsed when checksum offload or a VLAN tag is present.  If
  * the chain has too many segments (EFBIG from the DMA load or from the
  * driver) it is collapsed, then defragmented, before giving up.
  *
  * Returns 0 on success, in which case the chain is recorded in the software
  * descriptor at the old producer index and the queue counters are advanced.
  * ENOBUFS means there are not enough free descriptors: the chain is left
  * untouched for a later retry and the queue's task is enqueued.  On the
  * defrag_failed and default DMA-error paths the chain is freed and
  * *m_headp is set to NULL.
  */
 static int
 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 {
 	if_ctx_t		ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	bus_dma_tag_t		buf_tag;
 	bus_dma_segment_t	*segs;
 	struct mbuf		*m_head, **ifsd_m;
 	void			*next_txd;
 	bus_dmamap_t		map;
 	struct if_pkt_info	pi;
 	int remap = 0;
 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	segs = txq->ift_segs;
 	ntxd = txq->ift_size;
 	m_head = *m_headp;
 
 	/*
 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
 	 */
 	cidx = txq->ift_cidx;
 	pidx = txq->ift_pidx;
 	if (ctx->ifc_flags & IFC_PREFETCH) {
 		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
 		/* IFLIB_* flags belong to the shared context's isc_flags */
 		if (!(sctx->isc_flags & IFLIB_HAS_TXCQ)) {
 			next_txd = calc_next_txd(txq, cidx, 0);
 			prefetch(next_txd);
 		}
 
 		/* prefetch the next cache line of mbuf pointers and flags */
 		prefetch(&txq->ift_sds.ifsd_m[next]);
 		prefetch(&txq->ift_sds.ifsd_map[next]);
 	}
 	ifsd_m = txq->ift_sds.ifsd_m;
 
 	/* TSO packets use their own DMA tag, map and segment limit */
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		buf_tag = txq->ift_tso_buf_tag;
 		max_segs = scctx->isc_tx_tso_segments_max;
 		map = txq->ift_sds.ifsd_tso_map[pidx];
 		MPASS(buf_tag != NULL);
 		MPASS(max_segs > 0);
 	} else {
 		buf_tag = txq->ift_buf_tag;
 		max_segs = scctx->isc_tx_nsegments;
 		map = txq->ift_sds.ifsd_map[pidx];
 	}
 	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
 	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
 		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
 		if (err) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 	}
 	m_head = *m_headp;
 
 	pkt_info_zero(&pi);
 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
 	pi.ipi_pidx = pidx;
 	pi.ipi_qsidx = txq->ift_id;
 	pi.ipi_len = m_head->m_pkthdr.len;
 	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
 	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
 
 	/* deliberate bitwise OR to make one condition */
 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 		m_head = *m_headp;
 	}
 
 retry:
 	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
 	    BUS_DMA_NOWAIT);
 defrag:
 	if (__predict_false(err)) {
 		switch (err) {
 		case EFBIG:
 			/* try collapse once and defrag once */
 			if (remap == 0) {
 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 				/* try defrag if collapsing fails */
 				if (m_head == NULL)
 					remap++;
 			}
 			if (remap == 1) {
 				txq->ift_mbuf_defrag++;
 				m_head = m_defrag(*m_headp, M_NOWAIT);
 			}
 			/*
 			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
 			 * failed to map an mbuf that was run through m_defrag
 			 */
 			MPASS(remap <= 1);
 			if (__predict_false(m_head == NULL || remap > 1))
 				goto defrag_failed;
 			remap++;
 			*m_headp = m_head;
 			goto retry;
 		case ENOMEM:
 			txq->ift_no_tx_dma_setup++;
 			break;
 		default:
 			txq->ift_no_tx_dma_setup++;
 			m_freem(*m_headp);
 			DBG_COUNTER_INC(tx_frees);
 			*m_headp = NULL;
 			break;
 		}
 		txq->ift_map_failed++;
 		DBG_COUNTER_INC(encap_load_mbuf_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		return (err);
 	}
 	/*
 	 * XXX assumes a 1 to 1 relationship between segments and
 	 *        descriptors - this does not hold true on all drivers, e.g.
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 		txq->ift_no_desc_avail++;
 		bus_dmamap_unload(buf_tag, map);
 		DBG_COUNTER_INC(encap_txq_avail_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		return (ENOBUFS);
 	}
 	/*
 	 * Record the chain only now that it will really be offered to the
 	 * driver; on the ENOBUFS return above the caller keeps ownership, so
 	 * the software descriptor must not reference it.
 	 */
 	ifsd_m[pidx] = m_head;
 	/*
 	 * On Intel cards we can greatly reduce the number of TX interrupts
 	 * we see by only setting report status on every Nth descriptor.
 	 * However, this also means that the driver will need to keep track
 	 * of the descriptors that RS was set on to check them for the DD bit.
 	 */
 	txq->ift_rs_pending += nsegs + 1;
 	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
 	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
 		pi.ipi_flags |= IPI_TX_INTR;
 		txq->ift_rs_pending = 0;
 	}
 
 	pi.ipi_segs = segs;
 	pi.ipi_nsegs = nsegs;
 
 	MPASS(pidx >= 0 && pidx < txq->ift_size);
 #ifdef PKT_DEBUG
 	print_pkt(&pi);
 #endif
 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
 		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
 		DBG_COUNTER_INC(tx_encap);
 		MPASS(pi.ipi_new_pidx < txq->ift_size);
 
 		/* descriptors used, allowing for the producer index wrapping */
 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
 			ndesc += txq->ift_size;
 			txq->ift_gen = 1;
 		}
 		/*
 		 * drivers can need as many as
 		 * two sentinels
 		 */
 		MPASS(ndesc <= pi.ipi_nsegs + 2);
 		MPASS(pi.ipi_new_pidx != pidx);
 		MPASS(ndesc > 0);
 		txq->ift_in_use += ndesc;
 		txq->ift_db_pending += ndesc;
 
 		/*
 		 * We update the last software descriptor again here because there may
 		 * be a sentinel and/or there may be more mbufs than segments
 		 */
 		txq->ift_pidx = pi.ipi_new_pidx;
 		txq->ift_npending += pi.ipi_ndescs;
 	} else {
 		/* the driver refused it: take the chain back out of ifsd_m[] */
 		*m_headp = m_head = iflib_remove_mbuf(txq);
 		if (err == EFBIG) {
 			txq->ift_txd_encap_efbig++;
 			if (remap < 2) {
 				remap = 1;
 				goto defrag;
 			}
 		}
 		goto defrag_failed;
 	}
 	/*
 	 * err can't possibly be non-zero here, so we don't need to test it
 	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
 	 */
 	return (err);
 
 defrag_failed:
 	txq->ift_mbuf_defrag_failed++;
 	txq->ift_map_failed++;
 	m_freem(*m_headp);
 	DBG_COUNTER_INC(tx_frees);
 	*m_headp = NULL;
 	DBG_COUNTER_INC(encap_txd_encap_fail);
 	return (ENOMEM);
 }
 
 /*
  * Release the software state of the next n transmit descriptors, starting
  * at the queue's consumer index.
  *
  * For every slot that holds an mbuf chain the matching DMA map (TSO or
  * regular, chosen by the chain's CSUM_TSO flag) is synced and unloaded, the
  * chain is freed and the slot is cleared.  The consumer index and the
  * generation flag are written back when done; the generation is reset to
  * zero whenever the index wraps.
  */
 static void
 iflib_tx_desc_free(iflib_txq_t txq, int n)
 {
 	struct mbuf **sd_mbufs, *pkt;
 	uint32_t idx, generation, ring_size, ring_mask;
 	bool want_prefetch;
 
 	idx = txq->ift_cidx;
 	generation = txq->ift_gen;
 	ring_size = txq->ift_size;
 	ring_mask = ring_size-1;
 	sd_mbufs = txq->ift_sds.ifsd_m;
 	want_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
 
 	for (; n > 0; n--) {
 		if (want_prefetch) {
 			prefetch(sd_mbufs[(idx + 3) & ring_mask]);
 			prefetch(sd_mbufs[(idx + 4) & ring_mask]);
 		}
 		pkt = sd_mbufs[idx];
 		if (pkt != NULL) {
 			prefetch(&sd_mbufs[(idx + CACHE_PTR_INCREMENT) & ring_mask]);
 			if (pkt->m_pkthdr.csum_flags & CSUM_TSO) {
 				bus_dmamap_sync(txq->ift_tso_buf_tag,
 				    txq->ift_sds.ifsd_tso_map[idx],
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(txq->ift_tso_buf_tag,
 				    txq->ift_sds.ifsd_tso_map[idx]);
 			} else {
 				bus_dmamap_sync(txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[idx],
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[idx]);
 			}
 			/* XXX we don't support any drivers that batch packets yet */
 			MPASS(pkt->m_nextpkt == NULL);
 			m_freem(pkt);
 			sd_mbufs[idx] = NULL;
 #if MEMORY_LOGGING
 			txq->ift_dequeued++;
 #endif
 			DBG_COUNTER_INC(tx_frees);
 		}
 		idx++;
 		if (__predict_false(idx == ring_size)) {
 			/* wrapped around the ring */
 			idx = 0;
 			generation = 0;
 		}
 	}
 	txq->ift_cidx = idx;
 	txq->ift_gen = generation;
 }
 
 /*
  * Refresh the queue's TX credits and, if more than 'thresh' descriptors are
  * reclaimable, free all of them and adjust the cleaned/in-use accounting.
  *
  * Returns the number of descriptors reclaimed, or 0 when the reclaimable
  * count does not exceed the threshold.
  */
 static __inline int
 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	int reclaim;
 
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh < txq->ift_size);
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
 	iflib_tx_credits_update(ctx, txq);
 	reclaim = DESC_RECLAIMABLE(txq);
 
 	if (reclaim > thresh) {
 		iflib_tx_desc_free(txq, reclaim);
 		txq->ift_cleaned += reclaim;
 		txq->ift_in_use -= reclaim;
 		return (reclaim);
 	}
 
 	/* Not enough to be worth reclaiming yet. */
 #ifdef INVARIANTS
 	if (iflib_verbose_debug) {
 		printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 		       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 		       reclaim, thresh);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Return the address of the ring slot located 'offset' entries past 'cidx'
  * (wrapping with the ring size), prefetching that slot's mbuf.  When more
  * than one entry remains, the next few slots' mbufs and the slot array
  * itself are prefetched as well.
  */
 static struct mbuf **
 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
 {
 	struct mbuf **items;
 	int size, mask, cur, ahead;
 
 	size = r->size;
 	mask = size - 1;
 	cur = (cidx + offset) & mask;
 	ahead = (cidx + CACHE_PTR_INCREMENT) & mask;
 	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
 
 	prefetch(items[cur]);
 	if (remaining > 1) {
 		prefetch2cachelines(&items[ahead]);
 		prefetch2cachelines(items[(cidx + offset + 1) & mask]);
 		prefetch2cachelines(items[(cidx + offset + 2) & mask]);
 		prefetch2cachelines(items[(cidx + offset + 3) & mask]);
 	}
 	return (__DEVOLATILE(struct mbuf **, &r->items[cur]));
 }
 
 /*
  * Ask the mp_ring backing this transmit queue to check for drainage with
  * the given budget.
  */
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
 	struct ifmp_ring *ring = txq->ift_br;
 
 	ifmp_ring_check_drainage(ring, budget);
 }
 
 /*
  * Report whether the transmit queue behind ring 'r' can make progress.
  * Returns 1 right away when more than MAX_TX_DESC(ctx) + 2 descriptors are
  * available; otherwise syncs the descriptor DMA area and returns whatever
  * the driver's credits-update method reports (called with clear == false).
  */
 static uint32_t
 iflib_txq_can_drain(struct ifmp_ring *r)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	if (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2) {
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		return (ctx->isc_txd_credits_update(ctx->ifc_softc,
 		    txq->ift_id, false));
 	}
 	return (1);
 }
 
 /*
  * mp_ring drain routine for a transmit queue (it is the function installed
  * as r->drain by iflib_ifmp_purge() when it restores the normal callbacks).
  *
  * Reclaims completed descriptors, then encapsulates up to TX_BATCH_SIZE of
  * the ring entries between 'cidx' and 'pidx' while enough hardware
  * descriptors remain available.  Entries equal to the txq address are
  * sentinels and are skipped rather than transmitted.
  *
  * Returns the number of ring entries consumed (skipped + sent); 0 when the
  * interface is not running, the link is down, or IFF_DRV_OACTIVE is set;
  * and the full number of pending entries when a queue flush is in progress.
  */
 static uint32_t
 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	struct mbuf *m, **mp;
 	int avail, bytes_sent, skipped, count, err, i;
 	int mcast_sent, pkt_sent, reclaimed;
 	bool do_prefetch, rang, ring;
 
 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 			    !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
 	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
 	/* Number of entries currently queued in the mp_ring. */
 	avail = IDXDIFF(pidx, cidx, r->size);
 
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		/*
 		 * The driver is unloading so we need to free all pending packets.
 		 */
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
 			/* Sentinel entries are not mbufs and must not be freed. */
 			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
 				m_freem(r->items[(cidx + i) & (r->size-1)]);
 			r->items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
 
 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
 
 	/*
 	 * If we've reclaimed any packets this queue cannot be hung.
 	 */
 	if (reclaimed)
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	skipped = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
 		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 #endif
 	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
 	err = 0;
 	/* Stop early once fewer than MAX_TX_DESC(ctx) + 2 descriptors remain. */
 	for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
 		/* A 'remaining' of 0 disables the look-ahead prefetches. */
 		int rem = do_prefetch ? count - i : 0;
 
 		mp = _ring_peek_one(r, cidx, i, rem);
 		MPASS(mp != NULL && *mp != NULL);
 
 		/*
 		 * Completion interrupts will use the address of the txq
 		 * as a sentinel to enqueue _something_ in order to acquire
 		 * the lock on the mp_ring (there's no direct lock call).
 		 * We obviously have to check for these sentinel cases
 		 * and skip them.
 		 */
 		if (__predict_false(*mp == (struct mbuf *)txq)) {
 			skipped++;
 			continue;
 		}
 		err = iflib_encap(txq, mp);
 		if (__predict_false(err)) {
 			/* no room - bail out */
 			if (err == ENOBUFS)
 				break;
 			skipped++;
 			/* we can't send this packet - skip it */
 			continue;
 		}
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
 		mcast_sent += !!(m->m_flags & M_MCAST);
 
 		if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)))
 			break;
 		ETHER_BPF_MTAP(ifp, m);
 		rang = iflib_txd_db_check(txq, false);
 	}
 
 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
 	ring = rang ? false  : (iflib_min_tx_latency | err);
 	iflib_txd_db_check(txq, ring);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("consumed=%d\n", skipped + pkt_sent);
 #endif
 	return (skipped + pkt_sent);
 }
 
 /*
  * can_drain callback that unconditionally answers "yes" (1).  It is
  * installed temporarily by iflib_ifmp_purge() so the ring is always drained.
  */
 static uint32_t
 iflib_txq_drain_always(struct ifmp_ring *r)
 {
 	(void)r;	/* the ring itself is never inspected */
 
 	return (1);
 }
 
 /*
  * Drain callback used while purging a queue: marks the queue idle, stops
  * its timer, and frees every mbuf queued between 'cidx' and 'pidx'.
  * Sentinel entries (the txq address itself) are left alone.
  *
  * Returns the number of ring entries covered, sentinels included.
  */
 static uint32_t
 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	struct mbuf **slot;
 	int pending, k;
 
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	CALLOUT_LOCK(txq);
 	callout_stop(&txq->ift_timer);
 	CALLOUT_UNLOCK(txq);
 
 	pending = IDXDIFF(pidx, cidx, r->size);
 	for (k = 0; k < pending; k++) {
 		slot = _ring_peek_one(r, cidx, k, pending - k);
 		if (__predict_true(*slot != (struct mbuf *)txq)) {
 			m_freem(*slot);
 			DBG_COUNTER_INC(tx_frees);
 		}
 	}
 	MPASS(ifmp_ring_is_stalled(r) == 0);
 	return (pending);
 }
 
 /*
  * Discard everything queued on the transmit queue's mp_ring.  The ring's
  * callbacks are swapped for ones that always drain and simply free the
  * entries, a drainage pass over the whole ring is requested, and the
  * regular transmit callbacks are then put back.
  */
 static void
 iflib_ifmp_purge(iflib_txq_t txq)
 {
 	struct ifmp_ring *ring = txq->ift_br;
 
 	/* Install the "free everything" callbacks. */
 	ring->drain = iflib_txq_drain_free;
 	ring->can_drain = iflib_txq_drain_always;
 
 	ifmp_ring_check_drainage(ring, ring->size);
 
 	/* Restore normal transmit behaviour. */
 	ring->drain = iflib_txq_drain;
 	ring->can_drain = iflib_txq_can_drain;
 }
 
 /*
  * Transmit task handler for one TX queue ('context' is the iflib_txq_t).
  *
  * Does nothing unless the interface is running.  Otherwise it gives netmap
  * (when compiled in and enabled) first refusal, kicks the ALTQ start
  * routine if ALTQ is enabled, pokes the queue's mp_ring, and finally
  * re-enables the interrupt for this queue (or the single legacy interrupt).
  */
 static void
 _task_fn_tx(void *context)
 {
 	iflib_txq_t txq = context;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	int abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 #ifdef IFLIB_DIAGNOSTICS
 	txq->ift_cpu_exec_count[curcpu]++;
 #endif
 	if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 		return;
 #ifdef DEV_NETMAP
 	/* If netmap consumed the interrupt, skip the mp_ring processing. */
 	if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
 	    netmap_tx_irq(ifp, txq->ift_id))
 		goto skip_ifmp;
 #endif
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) /* XXX - DRVAPI */
 		iflib_altq_if_start(ifp);
 #endif
 	/*
 	 * With doorbell updates pending, enqueue the txq address itself as a
 	 * sentinel entry (the drain routine recognizes and skips it);
 	 * otherwise, when not abdicating, just check drainage.
 	 */
 	if (txq->ift_db_pending)
 		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
 	else if (!abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 	/*
 	 * When abdicating, we always need to check drainage, not just when we don't enqueue
 	 */
 	if (abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 #ifdef DEV_NETMAP
 skip_ifmp:
 #endif
 	if (ctx->ifc_flags & IFC_LEGACY)
 		IFDI_INTR_ENABLE(ctx);
 	else
 		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
 }
 
 /*
  * Receive task handler for one RX queue ('context' is the iflib_rxq_t).
  *
  * Does nothing unless the interface is running.  Processes received
  * packets via iflib_rxeof() (or lets netmap handle the interrupt when it
  * claims it), then either re-enables the queue interrupt, re-enqueues
  * itself if more work remains, or arms the RX watchdog callout when
  * IFLIB_RXEOF_EMPTY was reported.
  */
 static void
 _task_fn_rx(void *context)
 {
 	iflib_rxq_t rxq = context;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	uint8_t more;
 	uint16_t budget;
 #ifdef DEV_NETMAP
 	u_int work = 0;
 	int nmirq;
 #endif
 
 #ifdef IFLIB_DIAGNOSTICS
 	rxq->ifr_cpu_exec_count[curcpu]++;
 #endif
 	DBG_COUNTER_INC(task_fn_rxs);
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 #ifdef DEV_NETMAP
 	/* Anything other than NM_IRQ_PASS means netmap handled the interrupt. */
 	nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
 	if (nmirq != NM_IRQ_PASS) {
 		more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
 		goto skip_rxeof;
 	}
 #endif
 	/* A sysctl budget of 0 falls back to a hard-coded 16. */
 	budget = ctx->ifc_sysctl_rx_budget;
 	if (budget == 0)
 		budget = 16;	/* XXX */
 	more = iflib_rxeof(rxq, budget);
 #ifdef DEV_NETMAP
 skip_rxeof:
 #endif
 	/* Only re-enable the interrupt once no more work is outstanding. */
 	if ((more & IFLIB_RXEOF_MORE) == 0) {
 		if (ctx->ifc_flags & IFC_LEGACY)
 			IFDI_INTR_ENABLE(ctx);
 		else
 			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 		DBG_COUNTER_INC(rx_intr_enables);
 	}
 	/* Re-check: do not reschedule or arm the callout if no longer running. */
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 
 	if (more & IFLIB_RXEOF_MORE)
 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
 	else if (more & IFLIB_RXEOF_EMPTY)
 		callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
 }
 
 /*
  * Admin task handler ('context' is the if_ctx_t).
  *
  * Takes a snapshot of the relevant state flags under the state lock and
  * clears the pending reset/watchdog requests.  Unless the interface is
  * neither running nor OACTIVE (and the driver did not ask for
  * IFLIB_ADMIN_ALWAYS_RUN), or a detach is in progress, it then, under the
  * context lock: stops the per-queue timers, runs the driver's admin
  * handlers, restarts the timers, re-enables the link interrupt and
  * performs a requested reset.  Finally, if the link is active, every TX
  * queue is asked to check for drainage.
  */
 static void
 _task_fn_admin(void *context)
 {
 	if_ctx_t ctx = context;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	iflib_txq_t txq;
 	int i;
 	bool oactive, running, do_reset, do_watchdog, in_detach;
 
 	/* Snapshot flags and consume the reset/watchdog requests atomically. */
 	STATE_LOCK(ctx);
 	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
 	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
 	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
 	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
 	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
 	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
 	STATE_UNLOCK(ctx);
 
 	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 		return;
 	if (in_detach)
 		return;
 
 	CTX_LOCK(ctx);
 	/* Stop the per-queue timers while the admin work is performed. */
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 	}
 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_ADMINCQ)
 		IFDI_ADMIN_COMPLETION_HANDLE(ctx);
 	if (do_watchdog) {
 		ctx->ifc_watchdog_events++;
 		IFDI_WATCHDOG_RESET(ctx);
 	}
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	/* Re-arm each queue timer on the CPU it was last scheduled on. */
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 		    txq->ift_timer.c_cpu);
 	}
 	IFDI_LINK_INTR_ENABLE(ctx);
 	if (do_reset)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	if (LINK_ACTIVE(ctx) == 0)
 		return;
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 }
 
 /*
  * IOV task handler ('context' is the if_ctx_t): invokes the driver's VFLR
  * handler under the context lock, provided the interface is running or the
  * driver set IFLIB_ADMIN_ALWAYS_RUN.
  */
 static void
 _task_fn_iov(void *context)
 {
 	if_ctx_t ctx = context;
 
 	if ((if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) ||
 	    (ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN)) {
 		CTX_LOCK(ctx);
 		IFDI_VFLR_HANDLE(ctx);
 		CTX_UNLOCK(ctx);
 	}
 }
 
 static int
 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 	if_int_delay_info_t info;
 	if_ctx_t ctx;
 
 	info = (if_int_delay_info_t)arg1;
 	ctx = info->iidi_ctx;
 	info->iidi_req = req;
 	info->iidi_oidp = oidp;
 	CTX_LOCK(ctx);
 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 /*********************************************************************
  *
  *  IFNET FUNCTIONS
  *
  **********************************************************************/
 
 /*
  * (Re)initialize the interface: stop it first, then run the locked init
  * path.  The callers in this file invoke it with the context lock held.
  */
 static void
 iflib_if_init_locked(if_ctx_t ctx)
 {
 
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 }
 
 /*
  * Init entry point taking the context as an opaque pointer: acquires the
  * context lock around iflib_if_init_locked().
  */
 static void
 iflib_if_init(void *arg)
 {
 	if_ctx_t ictx;
 
 	ictx = arg;
 	CTX_LOCK(ictx);
 	iflib_if_init_locked(ictx);
 	CTX_UNLOCK(ictx);
 }
 
 /*
  * Transmit entry point: pick a TX queue for mbuf 'm' and enqueue it on
  * that queue's mp_ring.
  *
  * Queue selection, in priority order: the driver's isc_txq_select_v2
  * method (after a partial header parse), the driver's isc_txq_select
  * method, or iflib's hash-based QIDX() when there is more than one queue
  * set, the mbuf carries a hash type and ALTQ is not enabled.  Queue 0 is
  * used otherwise.
  *
  * Returns 0 on success, ENETDOWN (mbuf freed) when the interface is not
  * running or the link is down, the header-parse error, or the error from
  * ifmp_ring_enqueue() (mbuf freed).
  */
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int err, qidx;
 	int abdicate;
 
 	if (__predict_false((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	MPASS(m->m_nextpkt == NULL);
 	/* ALTQ-enabled interfaces always use queue 0. */
 	qidx = 0;
 	/* Use driver-supplied queue selection method if it exists */
 	if (ctx->isc_txq_select_v2) {
 		struct if_pkt_info pi;
 		uint64_t early_pullups = 0;
 		pkt_info_zero(&pi);
 
 		/*
 		 * NOTE(review): 'm' is not freed here on failure; presumably
 		 * iflib_parse_header_partial() disposes of it - confirm.
 		 */
 		err = iflib_parse_header_partial(&pi, &m, &early_pullups);
 		if (__predict_false(err != 0)) {
 			/* Assign pullups for bad pkts to default queue */
 			ctx->ifc_txqs[0].ift_pullups += early_pullups;
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 		/* Let driver make queueing decision */
 		qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi);
 		ctx->ifc_txqs[qidx].ift_pullups += early_pullups;
 	}
 	/* Backwards compatibility w/ simpler queue select */
 	else if (ctx->isc_txq_select)
 		qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
 	/* If not, use iflib's standard method */
 	else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
 		qidx = QIDX(ctx, m);
 
 	/* Set TX queue */
 	txq = &ctx->ifc_txqs[qidx];
 
 	/*
 	 * NOTE(review): the DRIVER_BACKPRESSURE and notyet regions below use
 	 * identifiers (next, count, mp, marr, i) that this function does not
 	 * declare, so they would not compile if enabled as-is.
 	 */
 #ifdef DRIVER_BACKPRESSURE
 	if (txq->ift_closed) {
 		while (m != NULL) {
 			next = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			DBG_COUNTER_INC(tx_frees);
 			m = next;
 		}
 		return (ENOBUFS);
 	}
 #endif
 #ifdef notyet
 	qidx = count = 0;
 	mp = marr;
 	next = m;
 	do {
 		count++;
 		next = next->m_nextpkt;
 	} while (next != NULL);
 
 	if (count > nitems(marr))
 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 			/* XXX check nextpkt */
 			m_freem(m);
 			/* XXX simplify for now */
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOBUFS);
 		}
 	for (next = m, i = 0; next != NULL; i++) {
 		mp[i] = next;
 		next = next->m_nextpkt;
 		mp[i]->m_nextpkt = NULL;
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
 	abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
 
 	/* When abdicating, always schedule the TX task for this queue. */
 	if (abdicate)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
  	if (err) {
 		/* Enqueue failed: schedule the task, nudge the ring, drop 'm'. */
 		if (!abdicate)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		/* support forthcoming later */
 #ifdef DRIVER_BACKPRESSURE
 		txq->ift_closed = TRUE;
 #endif
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 		m_freem(m);
 		DBG_COUNTER_INC(tx_frees);
 	}
 
 	return (err);
 }
 
 #ifdef ALTQ
 /*
  * The overall approach to integrating iflib with ALTQ is to continue to use
  * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
  * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
  * is redundant/unnecessary, but doing so minimizes the amount of
  * ALTQ-specific code required in iflib.  It is assumed that the overhead of
  * redundantly queueing to an intermediate mp_ring is swamped by the
  * performance limitations inherent in using ALTQ.
  *
  * When ALTQ support is compiled in, all iflib drivers will use a transmit
  * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
  * given interface.  If ALTQ is enabled for an interface, then all
  * transmitted packets for that interface will be submitted to the ALTQ
  * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
  * implementation because it uses IFQ_HANDOFF(), which will duplicatively
  * update stats that the iflib machinery handles, and which is sensitive to
  * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
  * will be installed as the start routine for use by ALTQ facilities that
  * need to trigger queue drains on a scheduled basis.
  *
  */
 /*
  * Drain the interface's send queue: with the queue lock held, dequeue
  * packets one at a time and hand each to iflib_if_transmit() until the
  * queue is empty.
  */
 static void
 iflib_altq_if_start(if_t ifp)
 {
 	struct ifaltq *sendq = &ifp->if_snd; /* XXX - DRVAPI */
 	struct mbuf *pkt;
 
 	IFQ_LOCK(sendq);
 	for (;;) {
 		IFQ_DEQUEUE_NOLOCK(sendq, pkt);
 		if (pkt == NULL)
 			break;
 		iflib_if_transmit(ifp, pkt);
 	}
 	IFQ_UNLOCK(sendq);
 }
 
 /*
  * Transmit routine used when ALTQ support is compiled in.  With ALTQ
  * disabled on the interface the packet goes straight to
  * iflib_if_transmit(); otherwise it is enqueued on the send queue and, if
  * that succeeded, the queue is drained via iflib_altq_if_start().
  * Returns the error from whichever path was taken.
  */
 static int
 iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd)) /* XXX - DRVAPI */
 		return (iflib_if_transmit(ifp, m));
 
 	IFQ_ENQUEUE(&ifp->if_snd, m, err); /* XXX - DRVAPI */
 	if (err == 0)
 		iflib_altq_if_start(ifp);
 	return (err);
 }
 #endif /* ALTQ */
 
 /*
  * Flush all transmit queues.  IFC_QFLUSH is set in the context flags while
  * each queue's mp_ring is repeatedly asked to drain until it is idle or
  * stalled; the flag is then cleared and the generic if_qflush() is called.
  */
 static void
 iflib_if_qflush(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int qid;
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 
 	txq = ctx->ifc_txqs;
 	for (qid = 0; qid < NTXQSETS(ctx); qid++, txq++) {
 		while (!ifmp_ring_is_idle(txq->ift_br) &&
 		    !ifmp_ring_is_stalled(txq->ift_br))
 			iflib_txq_check_drain(txq, 0);
 	}
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 
 	/*
 	 * When ALTQ is enabled, this will also take care of purging the
 	 * ALTQ queue(s).
 	 */
 	if_qflush(ifp);
 }
 
 /*
  * Capability bits that the SIOCSIFCAP handler in iflib_if_ioctl() will
  * toggle on request (TOE and WOL bits are handled separately there).
  */
 #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
 		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
 		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
 
 /*
  * Interface ioctl handler.
  *
  * Handles address, MTU, flags, multicast, media, I2C, capability and
  * driver-private requests itself (delegating to the driver's IFDI_*
  * methods, mostly under the context lock) and passes everything else to
  * ether_ioctl().  If a handler requested it via 'reinit', the interface is
  * (re)initialized before returning.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	struct ifreq	*ifr = (struct ifreq *)data;
 #if defined(INET) || defined(INET6)
 	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
 	bool		avoid_reset = false;
 	int		err = 0, reinit = 0, bits;
 
 	switch (command) {
 	case SIOCSIFADDR:
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			avoid_reset = true;
 #endif
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6)
 			avoid_reset = true;
 #endif
 		/*
 		** Calling init results in link renegotiation,
 		** so we avoid doing it when possible.
 		*/
 		if (avoid_reset) {
 			if_setflagbits(ifp, IFF_UP,0);
 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 				reinit = 1;
 #ifdef INET
 			if (!(if_getflags(ifp) & IFF_NOARP))
 				arp_ifinit(ifp, ifa);
 #endif
 		} else
 			err = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
 		CTX_LOCK(ctx);
 		/* Nothing to do when the MTU is unchanged. */
 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
 			CTX_UNLOCK(ctx);
 			break;
 		}
 		/* Remember the driver flags so they can be restored below. */
 		bits = if_getdrvflags(ifp);
 		/* stop the driver and free any clusters before proceeding */
 		iflib_stop(ctx);
 
 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 			STATE_LOCK(ctx);
 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 				ctx->ifc_flags |= IFC_MULTISEG;
 			else
 				ctx->ifc_flags &= ~IFC_MULTISEG;
 			STATE_UNLOCK(ctx);
 			err = if_setmtu(ifp, ifr->ifr_mtu);
 		}
 		/* Re-initialize regardless of whether the MTU change worked. */
 		iflib_init_locked(ctx);
 		STATE_LOCK(ctx);
 		if_setdrvflags(ifp, bits);
 		STATE_UNLOCK(ctx);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCSIFFLAGS:
 		CTX_LOCK(ctx);
 		if (if_getflags(ifp) & IFF_UP) {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				/* Only PROMISC/ALLMULTI changes need the driver. */
 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					/* The context lock is dropped around the call. */
 					CTX_UNLOCK(ctx);
 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 					CTX_LOCK(ctx);
 				}
 			} else
 				reinit = 1;
 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			iflib_stop(ctx);
 		}
 		ctx->ifc_if_flags = if_getflags(ifp);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			CTX_LOCK(ctx);
 			IFDI_INTR_DISABLE(ctx);
 			IFDI_MULTI_SET(ctx);
 			IFDI_INTR_ENABLE(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	case SIOCSIFMEDIA:
 		CTX_LOCK(ctx);
 		IFDI_MEDIA_SET(ctx);
 		CTX_UNLOCK(ctx);
 		/* FALLTHROUGH */
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
 		break;
 	case SIOCGI2C:
 	{
 		struct ifi2creq i2c;
 
 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (err != 0)
 			break;
 		/* Only device addresses 0xA0 and 0xA2 are accepted. */
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			err = EINVAL;
 			break;
 		}
 		/* Reject lengths that would overrun the request's data buffer. */
 		if (i2c.len > sizeof(i2c.data)) {
 			err = EINVAL;
 			break;
 		}
 
 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
 			    sizeof(i2c));
 		break;
 	}
 	case SIOCSIFCAP:
 	{
 		int mask, setmask, oldmask;
 
 		oldmask = if_getcapenable(ifp);
 		/* Bits that differ from the current setting and are supported. */
 		mask = ifr->ifr_reqcap ^ oldmask;
 		mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
 		setmask = 0;
 #ifdef TCP_OFFLOAD
 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 #endif
 		setmask |= (mask & IFCAP_FLAGS);
 		setmask |= (mask & IFCAP_WOL);
 
 		/*
 		 * If any RX csum has changed, change all the ones that
 		 * are supported by the driver.
 		 */
 		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
 			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 		}
 
 		/*
 		 * want to ensure that traffic has stopped before we change any of the flags
 		 */
 		if (setmask) {
 			CTX_LOCK(ctx);
 			bits = if_getdrvflags(ifp);
 			/* WOL-only changes do not require a stop/init cycle. */
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_stop(ctx);
 			STATE_LOCK(ctx);
 			if_togglecapenable(ifp, setmask);
 			ctx->ifc_softc_ctx.isc_capenable ^= setmask;
 			STATE_UNLOCK(ctx);
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_init_locked(ctx);
 			STATE_LOCK(ctx);
 			if_setdrvflags(ifp, bits);
 			STATE_UNLOCK(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		if_vlancap(ifp);
 		break;
 	}
 	case SIOCGPRIVATE_0:
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		CTX_LOCK(ctx);
 		err = IFDI_PRIV_IOCTL(ctx, command, data);
 		CTX_UNLOCK(ctx);
 		break;
 	default:
 		err = ether_ioctl(ifp, command, data);
 		break;
 	}
 	/* Deferred (re)initialization; iflib_if_init() takes the context lock. */
 	if (reinit)
 		iflib_if_init(ctx);
 	return (err);
 }
 
 /*
  * Counter accessor: forwards the request for counter 'cnt' to the driver's
  * IFDI_GET_COUNTER method and returns its value.
  */
 static uint64_t
 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 {
 	if_ctx_t softc_ctx;
 
 	softc_ctx = if_getsoftc(ifp);
 	return (IFDI_GET_COUNTER(softc_ctx, cnt));
 }
 
 /*********************************************************************
  *
  *  OTHER FUNCTIONS EXPORTED TO THE STACK
  *
  **********************************************************************/
 
 /*
  * VLAN registration event handler.  Ignored when the event is for another
  * context, when the tag is outside 1..4095, or while the context is
  * detaching.  Otherwise the driver's IFDI_VLAN_REGISTER method is called
  * under the context lock, bracketed by a stop/init cycle if the driver
  * reports that VLAN configuration changes need a restart.
  */
 static void
 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg || vtag == 0 || vtag > 4095 ||
 	    iflib_in_detach(ctx))
 		return;
 
 	CTX_LOCK(ctx);
 	/* Driver may need all untagged packets to be flushed */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_stop(ctx);
 	}
 	IFDI_VLAN_REGISTER(ctx, vtag);
 	/* Re-init to load the changes, if required */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_init_locked(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * VLAN unregistration event handler.  Ignored when the event is for
  * another context or the tag is outside 1..4095.  Otherwise the driver's
  * IFDI_VLAN_UNREGISTER method is called under the context lock, bracketed
  * by a stop/init cycle if the driver reports that VLAN configuration
  * changes need a restart.
  */
 static void
 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg || vtag == 0 || vtag > 4095)
 		return;
 
 	CTX_LOCK(ctx);
 	/* Driver may need all tagged packets to be flushed */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_stop(ctx);
 	}
 	IFDI_VLAN_UNREGISTER(ctx, vtag);
 	/* Re-init to load the changes, if required */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_init_locked(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * LED callback ('arg' is the if_ctx_t): passes the on/off request to the
  * driver's IFDI_LED_FUNC method under the context lock.
  */
 static void
 iflib_led_func(void *arg, int onoff)
 {
 	if_ctx_t led_ctx;
 
 	led_ctx = arg;
 	CTX_LOCK(led_ctx);
 	IFDI_LED_FUNC(led_ctx, onoff);
 	CTX_UNLOCK(led_ctx);
 }
 
 /*********************************************************************
  *
  *  BUS FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * Generic probe routine: match the device's PCI IDs against the vendor
  * table in the driver's shared context.
  *
  * A table entry matches when vendor and device IDs are equal and each of
  * subvendor, subdevice and revision is either equal or 0 (wildcard) in the
  * table.  The table ends at the first entry with a vendor ID of 0.
  *
  * Returns BUS_PROBE_DEFAULT on a match (after setting the device
  * description), ENOTSUP if the driver's shared context is missing or has a
  * bad magic, and ENXIO if no entry matches.
  */
 int
 iflib_device_probe(device_t dev)
 {
 	const pci_vendor_info_t *ent;
 	if_shared_ctx_t sctx;
 	uint16_t pci_vendor_id, pci_device_id, pci_subvendor_id;
 	uint16_t pci_subdevice_id, pci_rev_id;
 
 	sctx = DEVICE_REGISTER(dev);
 	if (sctx == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_vendor_id = pci_get_vendor(dev);
 	pci_device_id = pci_get_device(dev);
 	pci_subvendor_id = pci_get_subvendor(dev);
 	pci_subdevice_id = pci_get_subdevice(dev);
 	pci_rev_id = pci_get_revid(dev);
 	/* Give the driver a chance to rewrite the IDs before matching. */
 	if (sctx->isc_parse_devinfo != NULL)
 		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
 
 	for (ent = sctx->isc_vendor_info; ent->pvi_vendor_id != 0; ent++) {
 		if (pci_vendor_id != ent->pvi_vendor_id ||
 		    pci_device_id != ent->pvi_device_id)
 			continue;
 		if (ent->pvi_subvendor_id != 0 &&
 		    pci_subvendor_id != ent->pvi_subvendor_id)
 			continue;
 		if (ent->pvi_subdevice_id != 0 &&
 		    pci_subdevice_id != ent->pvi_subdevice_id)
 			continue;
 		if (ent->pvi_rev_id != 0 && pci_rev_id != ent->pvi_rev_id)
 			continue;
 
 		device_set_desc_copy(dev, ent->pvi_name);
 		/*
 		 * This needs to be changed to zero if the bus probing code
 		 * ever stops re-probing on best match, because the sctx may
 		 * have its values overwritten by register calls in
 		 * subsequent probes.
 		 */
 		return (BUS_PROBE_DEFAULT);
 	}
 	return (ENXIO);
 }
 
 /*
  * Same as iflib_device_probe(), except that a BUS_PROBE_DEFAULT result is
  * reported as BUS_PROBE_VENDOR.  Any other result is passed through.
  */
 int
 iflib_device_probe_vendor(device_t dev)
 {
 	int result = iflib_device_probe(dev);
 
 	return (result == BUS_PROBE_DEFAULT ? BUS_PROBE_VENDOR : result);
 }
 
 /*
  * Recompute the queue-set and descriptor counts in the softc context.
  *
  * Non-zero sysctl overrides replace the number of TX/RX queue sets.  Each
  * descriptor count comes from its sysctl override when non-zero, else from
  * the driver default, and is then clamped to the driver's min/max.  A
  * count that is not a power of two is replaced by the default.  Every
  * adjustment is reported with device_printf().
  */
 static void
 iflib_reset_qvalues(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	device_t dev = ctx->ifc_dev;
 	int q;
 
 	if (ctx->ifc_sysctl_ntxqs != 0)
 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 	if (ctx->ifc_sysctl_nrxqs != 0)
 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 
 	/* Receive descriptor counts: pick a value, then validate it. */
 	for (q = 0; q < sctx->isc_nrxqs; q++) {
 		if (ctx->ifc_sysctl_nrxds[q] != 0)
 			scctx->isc_nrxd[q] = ctx->ifc_sysctl_nrxds[q];
 		else
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_default[q];
 
 		if (scctx->isc_nrxd[q] < sctx->isc_nrxd_min[q]) {
 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 				      q, scctx->isc_nrxd[q], sctx->isc_nrxd_min[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_min[q];
 		}
 		if (scctx->isc_nrxd[q] > sctx->isc_nrxd_max[q]) {
 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 				      q, scctx->isc_nrxd[q], sctx->isc_nrxd_max[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_max[q];
 		}
 		if (!powerof2(scctx->isc_nrxd[q])) {
 			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
 				      q, scctx->isc_nrxd[q], sctx->isc_nrxd_default[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_default[q];
 		}
 	}
 
 	/* Transmit descriptor counts: same procedure. */
 	for (q = 0; q < sctx->isc_ntxqs; q++) {
 		if (ctx->ifc_sysctl_ntxds[q] != 0)
 			scctx->isc_ntxd[q] = ctx->ifc_sysctl_ntxds[q];
 		else
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_default[q];
 
 		if (scctx->isc_ntxd[q] < sctx->isc_ntxd_min[q]) {
 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 				      q, scctx->isc_ntxd[q], sctx->isc_ntxd_min[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_min[q];
 		}
 		if (scctx->isc_ntxd[q] > sctx->isc_ntxd_max[q]) {
 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 				      q, scctx->isc_ntxd[q], sctx->isc_ntxd_max[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_max[q];
 		}
 		if (!powerof2(scctx->isc_ntxd[q])) {
 			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
 				      q, scctx->isc_ntxd[q], sctx->isc_ntxd_default[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_default[q];
 		}
 	}
 }
 
 /*
  * Register an inbound Ethernet pfil head named after the interface and
  * store it in every receive queue of the context.
  */
 static void
 iflib_add_pfil(if_ctx_t ctx)
 {
 	struct pfil_head_args pa;
 	struct pfil_head *head;
 	iflib_rxq_t rxq;
 	int qid;
 
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = if_name(ctx->ifc_ifp);
 	head = pfil_head_register(&pa);
 
 	rxq = ctx->ifc_rxqs;
 	for (qid = 0; qid < NRXQSETS(ctx); qid++)
 		rxq[qid].pfil = head;
 }
 
 /*
  * Detach the pfil head from every receive queue and unregister it.  The
  * head is taken from the first receive queue before the pointers are
  * cleared.
  */
 static void
 iflib_rem_pfil(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	struct pfil_head *head = rxq->pfil;
 	int qid;
 
 	for (qid = 0; qid < NRXQSETS(ctx); qid++)
 		rxq[qid].pfil = NULL;
 
 	pfil_head_unregister(head);
 }
 
 
 /*
  * Step forward 'n' members through the cpuset ctx->ifc_cpus, starting at
  * 'cpuid' and wrapping from the last member back to the first.  'n' is
  * first reduced modulo the number of CPUs in the set, so a whole number of
  * laps is a no-op.  Returns the CPU ID that was reached.
  */
 static unsigned int
 cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
 {
 	unsigned int first_valid, last_valid, steps;
 
 	/* cpuid should always be in the valid set */
 	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
 
 	/* valid set should never be empty */
 	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
 
 	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
 	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 
 	for (steps = n % CPU_COUNT(&ctx->ifc_cpus); steps > 0; steps--) {
 		/* Move to the next CPU ID that is a member of the set. */
 		do {
 			if (++cpuid > last_valid)
 				cpuid = first_valid;
 		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
 	}
 
 	return (cpuid);
 }
 
 #if defined(SMP) && defined(SCHED_ULE)
 extern struct cpu_group *cpu_top;              /* CPU topology */
 
 /*
  * Return the index of the first child group of 'grp' whose CPU mask
  * contains 'cpu', or -1 if 'grp' has no children or none contains it.
  */
 static int
 find_child_with_core(int cpu, struct cpu_group *grp)
 {
 	int child;
 
 	if (grp->cg_children == 0)
 		return (-1);
 
 	MPASS(grp->cg_child);
 	for (child = 0; child < grp->cg_children; child++) {
 		if (!CPU_ISSET(cpu, &grp->cg_child[child].cg_mask))
 			continue;
 		return (child);
 	}
 
 	return (-1);
 }
 
 
 /*
  * Find an L2 neighbor of the given CPU, or return -1 if there is none.
  * When the CPU has several L2 neighbors no attempt is made to choose among
  * them: the lowest-numbered one is always returned.
  */
 static int
 find_l2_neighbor(int cpu)
 {
 	struct cpu_group *grp = cpu_top;
 	int idx;
 
 	if (grp == NULL)
 		return (-1);
 
 	/*
 	 * Descend to the smallest CPU group that contains the given core.
 	 */
 	for (;;) {
 		idx = find_child_with_core(cpu, grp);
 		if (idx == -1)
 			break;
 		/*
 		 * If the smallest group containing the given CPU has less
 		 * than two members, we conclude the given CPU has no
 		 * L2 neighbor.
 		 */
 		if (grp->cg_child[idx].cg_count <= 1)
 			return (-1);
 		grp = &grp->cg_child[idx];
 	}
 
 	/* Must share L2. */
 	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
 		return (-1);
 
 	/*
 	 * Select the first member of the set that isn't the reference
 	 * CPU, which at this point is guaranteed to exist.
 	 */
 	for (idx = 0; idx < CPU_SETSIZE; idx++) {
 		if (idx != cpu && CPU_ISSET(idx, &grp->cg_mask))
 			return (idx);
 	}
 
 	/* Should never be reached */
 	return (-1);
 }
 
 #else
 /*
  * Fallback used when the SMP/SCHED_ULE variant is not compiled in: no
  * neighbor is ever reported.
  */
 static int
 find_l2_neighbor(int cpu)
 {
 	(void)cpu;	/* no topology information to consult */
 
 	return (-1);
 }
 #endif
 
 /*
  * CPU mapping behaviors
  * ---------------------
  * 'separate txrx' refers to the separate_txrx sysctl
  * 'use logical' refers to the use_logical_cores sysctl
  * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
  *
  *  separate     use     INTR
  *    txrx     logical   CPUS   result
  * ---------- --------- ------ ------------------------------------------------
  *     -          -       X     RX and TX queues mapped to consecutive physical
  *                              cores with RX/TX pairs on same core and excess
  *                              of either following
  *     -          X       X     RX and TX queues mapped to consecutive cores
  *                              of any type with RX/TX pairs on same core and
  *                              excess of either following
  *     X          -       X     RX and TX queues mapped to consecutive physical
  *                              cores; all RX then all TX
  *     X          X       X     RX queues mapped to consecutive physical cores
  *                              first, then TX queues mapped to L2 neighbor of
  *                              the corresponding RX queue if one exists,
  *                              otherwise to consecutive physical cores
  *     -         n/a      -     RX and TX queues mapped to consecutive cores of
  *                              any type with RX/TX pairs on same core and excess
  *                              of either following
  *     X         n/a      -     RX and TX queues mapped to consecutive cores of
  *                              any type; all RX then all TX
  */
 static unsigned int
 get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
     bool is_tx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	unsigned int core_index;
 
 	/*
 	 * RX queues, and TX queues when TX and RX share CPUs, simply take
 	 * the qid'th CPU after the base.
 	 */
 	core_index = qid;
 
 	if (ctx->ifc_sysctl_separate_txrx && is_tx) {
 		/*
 		 * When using separate CPUs for TX and RX, the assignment
 		 * will always be of a consecutive CPU out of the set of
 		 * context CPUs, except for the specific case where the
 		 * context CPUs are physical cores, the use of logical cores
 		 * has been enabled, the assignment is for TX, the TX qid
 		 * corresponds to an RX qid, and the CPU assigned to the
 		 * corresponding RX queue has an L2 neighbor.
 		 */
 		if (ctx->ifc_sysctl_use_logical_cores &&
 		    ctx->ifc_cpus_are_physical_cores &&
 		    qid < scctx->isc_nrxqsets) {
 			unsigned int rx_cpuid;
 			int l2_neighbor;
 
 			rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
 			l2_neighbor = find_l2_neighbor(rx_cpuid);
 			if (l2_neighbor != -1)
 				return (l2_neighbor);
 			/*
 			 * ... else fall through to the normal
 			 * consecutive-after-RX assignment scheme.
 			 *
 			 * Note that we are assuming that all RX queue CPUs
 			 * have an L2 neighbor, or all do not.  If a mixed
 			 * scenario is possible, we will have to keep track
 			 * separately of how many queues prior to this one
 			 * were not able to be assigned to an L2 neighbor.
 			 */
 		}
 		/* TX queues are placed after all of the RX queues. */
 		core_index = scctx->isc_nrxqsets + qid;
 	}
 
 	return (cpuid_advance(ctx, base_cpuid, core_index));
 }
 
 /*
  * Determine the base CPU ID from which this context's queues are assigned.
  *
  * If the core_offset sysctl value is set (not CORE_OFFSET_UNSPECIFIED), it
  * is translated to a member of ctx->ifc_cpus and returned; no entry in the
  * global cpu_offsets list is created or referenced in that case.
  *
  * Otherwise the base is taken from the cpu_offsets entry whose CPU set
  * equals ctx->ifc_cpus (creating one if none exists), that entry's
  * next_cpuid is advanced by the number of context CPUs this device will
  * consume, and a reference is taken on the entry.
  */
 static uint16_t
 get_ctx_core_offset(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	struct cpu_offset *op;
 	cpuset_t assigned_cpus;
 	unsigned int cores_consumed;
 	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
 	unsigned int first_valid;
 	unsigned int last_valid;
 	unsigned int i;
 
 	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
 	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 
 	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
 		/*
 		 * Align the user-chosen base CPU ID to the next valid CPU
 		 * for this device.  If the chosen base CPU ID is smaller
 		 * than the first valid CPU or larger than the last valid
 		 * CPU, we assume the user does not know what the valid
 		 * range is for this device and is thinking in terms of a
 		 * zero-based reference frame, and so we shift the given
 		 * value into the valid range (and wrap accordingly) so the
 		 * intent is translated to the proper frame of reference.
 		 * If the base CPU ID is within the valid first/last, but
 		 * does not correspond to a valid CPU, it is advanced to the
 		 * next valid CPU (wrapping if necessary).
 		 */
 		if (base_cpuid < first_valid || base_cpuid > last_valid) {
 			/* shift from zero-based to first_valid-based */
 			base_cpuid += first_valid;
 			/*
 			 * Wrap to range [first_valid, last_valid].  The
 			 * remainder is an offset from first_valid, so
 			 * first_valid has to be added back; without it the
 			 * result falls back into the zero-based frame.
 			 */
 			base_cpuid = first_valid +
 			    (base_cpuid - first_valid) %
 			    (last_valid - first_valid + 1);
 		}
 		/*
 		 * base_cpuid is now in [first_valid, last_valid].  If it is
 		 * not a member of the valid set, there is always a member
 		 * with a greater CPU ID (last_valid at the latest), so
 		 * simply advance to it.
 		 */
 		while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
 			base_cpuid++;
 		return (base_cpuid);
 	}
 
 	/*
 	 * Determine how many cores will be consumed by performing the CPU
 	 * assignments and counting how many of the assigned CPUs correspond
 	 * to CPUs in the set of context CPUs.  This is done using the CPU
 	 * ID first_valid as the base CPU ID, as the base CPU must be within
 	 * the set of context CPUs.
 	 *
 	 * Note not all assigned CPUs will be in the set of context CPUs
 	 * when separate CPUs are being allocated to TX and RX queues,
 	 * assignment to logical cores has been enabled, the set of context
 	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
 	 * neighbors of CPUs that RX queues have been mapped to - in this
 	 * case we do only want to count how many CPUs in the set of context
 	 * CPUs have been consumed, as that determines the next CPU in that
 	 * set to start allocating at for the next device for which
 	 * core_offset is not set.
 	 */
 	CPU_ZERO(&assigned_cpus);
 	for (i = 0; i < scctx->isc_ntxqsets; i++)
 		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
 		    &assigned_cpus);
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
 		    &assigned_cpus);
 	CPU_AND(&assigned_cpus, &assigned_cpus, &ctx->ifc_cpus);
 	cores_consumed = CPU_COUNT(&assigned_cpus);
 
 	mtx_lock(&cpu_offset_mtx);
 	/* Reuse the bookkeeping entry for an identical CPU set, if any. */
 	SLIST_FOREACH(op, &cpu_offsets, entries) {
 		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
 			base_cpuid = op->next_cpuid;
 			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
 			    cores_consumed);
 			MPASS(op->refcount < UINT_MAX);
 			op->refcount++;
 			break;
 		}
 	}
 	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
 		/*
 		 * First device with this CPU set: start at first_valid and
 		 * record where the next such device should start.  If the
 		 * entry cannot be allocated the base is still returned, it
 		 * is just not recorded.
 		 */
 		base_cpuid = first_valid;
 		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (op == NULL) {
 			device_printf(ctx->ifc_dev,
 			    "allocation for cpu offset failed.\n");
 		} else {
 			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
 			    cores_consumed);
 			op->refcount = 1;
 			CPU_COPY(&ctx->ifc_cpus, &op->set);
 			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
 		}
 	}
 	mtx_unlock(&cpu_offset_mtx);
 
 	return (base_cpuid);
 }
 
 /*
  * Drop one reference on the cpu_offsets entry whose CPU set equals
  * ctx->ifc_cpus, unlinking and freeing the entry when the count reaches
  * zero.  Does nothing if no entry matches.
  */
 static void
 unref_ctx_core_offset(if_ctx_t ctx)
 {
 	struct cpu_offset *entry;
 
 	mtx_lock(&cpu_offset_mtx);
 	/* entry is NULL after the loop when nothing matched. */
 	SLIST_FOREACH(entry, &cpu_offsets, entries) {
 		if (CPU_CMP(&ctx->ifc_cpus, &entry->set) == 0)
 			break;
 	}
 	if (entry != NULL) {
 		MPASS(entry->refcount > 0);
 		entry->refcount--;
 		if (entry->refcount == 0) {
 			SLIST_REMOVE(&cpu_offsets, entry, cpu_offset, entries);
 			free(entry, M_IFLIB);
 		}
 	}
 	mtx_unlock(&cpu_offset_mtx);
 }
 
 /*
  * Allocate an iflib context for 'dev', run the driver's attach callbacks,
  * size and allocate the queues, set up interrupts and attach the ifnet.
  *
  * dev   - device being attached
  * sc    - driver softc, or NULL to have one of sctx->isc_driver->size bytes
  *         allocated here (the context then becomes the device softc and
  *         IFC_SC_ALLOCATED is set)
  * sctx  - the driver's shared context
  * ctxp  - on success, receives the new context
  *
  * Returns 0 on success.  On failure returns an errno value after undoing
  * the steps completed so far; *ctxp is not written in that case.
  */
 int
 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 {
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	kobjop_desc_t kobj_desc;
 	kobj_method_t *kobj_method;
 	int err, msix, rid;
 	int num_txd, num_rxd;
 
 	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
 
 	if (sc == NULL) {
 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 		device_set_softc(dev, ctx);
 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	}
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_dev = dev;
 	ctx->ifc_softc = sc;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "iflib_register failed %d\n", err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	iflib_reset_qvalues(ctx);
 	/* Both locks are held until the end of attach (or fail_unlock). */
 	IFNET_WLOCK();
 	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		goto fail_unlock;
 	}
 	_iflib_pre_assert(scctx);
 	/* Keep a private copy of the driver's TX/RX method table. */
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 	MPASS(scctx->isc_dma_width <= flsll(BUS_SPACE_MAXADDR));
 
 	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
 		ctx->ifc_mediap = scctx->isc_media;
 
 #ifdef INVARIANTS
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp,
 	    scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
 	if_setcapenable(ifp,
 	    scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
 
 	/* An unset queue-set count, or one above a non-zero max, becomes the max. */
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	num_txd = iflib_num_tx_descs(ctx);
 	num_rxd = iflib_num_rx_descs(ctx);
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 	    num_txd, num_rxd);
 
 	/*
 	 * Cap the per-packet segment counts at
 	 * num_txd / MAX_SINGLE_PACKET_FRACTION, but never below 1.
 	 */
 	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, num_txd /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > num_txd /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    num_txd / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* Set up cpu set.  If it fails, use the set of all CPUs. */
 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
 		device_printf(dev, "Unable to fetch CPU list\n");
 		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
 		ctx->ifc_cpus_are_physical_cores = false;
 	} else
 		ctx->ifc_cpus_are_physical_cores = true;
 	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
 
 	/*
 	** Now set up MSI or MSI-X, should return us the number of supported
 	** vectors (will be 1 for a legacy interrupt and MSI).
 	*/
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
 	       /*
 		* The simple fact that isc_msix_bar is not 0 does not mean
 		* we have a good value there that is known to work.
 		*/
 		msix = iflib_msix_init(ctx);
 	else {
 		scctx->isc_vectors = 1;
 		scctx->isc_ntxqsets = 1;
 		scctx->isc_nrxqsets = 1;
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 		msix = 0;
 	}
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_intr_free;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx)))
 		goto fail_queues;
 
 	/*
 	 * Now that we know how many queues there are, get the core offset.
 	 *
 	 * NOTE(review): get_ctx_core_offset() may take a reference on a
 	 * cpu_offsets entry; the error paths below do not call
 	 * unref_ctx_core_offset().  Confirm whether that is intended.
 	 */
 	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
 
 	if (msix > 1) {
 		/*
 		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
 		 * aren't the default NULL implementation.
 		 */
 		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
 		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 		    kobj_desc);
 		if (kobj_method == &kobj_desc->deflt) {
 			device_printf(dev,
 			    "MSI-X requires ifdi_rx_queue_intr_enable method\n");
 			err = EOPNOTSUPP;
 			goto fail_queues;
 		}
 		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
 		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 		    kobj_desc);
 		if (kobj_method == &kobj_desc->deflt) {
 			device_printf(dev,
 			    "MSI-X requires ifdi_tx_queue_intr_enable method\n");
 			err = EOPNOTSUPP;
 			goto fail_queues;
 		}
 
 		/*
 		 * Assign the MSI-X vectors.
 		 * Note that the default NULL ifdi_msix_intr_assign method will
 		 * fail here, too.
 		 */
 		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
 		if (err != 0) {
 			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
 			    err);
 			goto fail_queues;
 		}
 	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
 		/* Legacy interrupt uses rid 0, a single MSI uses rid 1. */
 		rid = 0;
 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
 			MPASS(msix == 1);
 			rid = 1;
 		}
 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 			goto fail_queues;
 		}
 	} else {
 		device_printf(dev,
 		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
 		err = ENODEV;
 		goto fail_queues;
 	}
 
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	if ((err = iflib_netmap_attach(ctx))) {
 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 		goto fail_detach;
 	}
 	*ctxp = ctx;
 
 	DEBUGNET_SET(ctx->ifc_ifp, iflib);
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	iflib_add_pfil(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
 	CTX_UNLOCK(ctx);
 	IFNET_WUNLOCK();
 
 	return (0);
 
 	/*
 	 * Error unwinding: each label undoes one stage and falls through to
 	 * the labels for the stages completed before it.
 	 */
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 fail_intr_free:
 	iflib_free_intr_mem(ctx);
 fail_unlock:
 	CTX_UNLOCK(ctx);
 	IFNET_WUNLOCK();
 	iflib_deregister(ctx);
 fail_ctx_free:
 	device_set_softc(ctx->ifc_dev, NULL);
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (err);
 }
 
 /*
  * Create and attach an iflib context for a cloned (pseudo or virtual)
  * interface.
  *
  * dev   - device the context is created for
  * sctx  - the driver's shared context
  * ctxp  - on success, receives the new context
  * clctx - clone-attach arguments passed on to IFDI_CLONEATTACH
  *
  * The softc is always allocated here.  When IFLIB_PSEUDO is set in
  * sctx->isc_flags the function attaches the ifnet and returns early without
  * allocating queues; otherwise it continues with descriptor sizing, queue
  * allocation and ether_ifattach().
  *
  * Returns 0 on success or the errno value of the failing step.
  */
 int
 iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
 					  struct iflib_cloneattach_ctx *clctx)
 {
 	int num_txd, num_rxd;
 	int err;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	void *sc;
 
 	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
 	sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 	ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
 		ctx->ifc_flags |= IFC_PSEUDO;
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_softc = sc;
 	ctx->ifc_dev = dev;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	iflib_reset_qvalues(ctx);
 	/* The context lock is held until attach completes or fails. */
 	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		goto fail_unlock;
 	}
 	/* Generate a MAC address when the driver asks for one. */
 	if (sctx->isc_flags & IFLIB_GEN_MAC)
 		ether_gen_addr(ifp, &ctx->ifc_mac);
 	if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
 								clctx->cc_params)) != 0) {
 		device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
 		goto fail_unlock;
 	}
 #ifdef INVARIANTS
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 
 	if_setflagbits(ifp, IFF_NOGROUP, 0);
 	/*
 	 * IFLIB_PSEUDO interfaces: attach the ifnet (as Ethernet, or as a
 	 * plain interface with a DLT_NULL bpf tap), run the post-attach
 	 * callback and return without setting up any queues.
 	 */
 	if (sctx->isc_flags & IFLIB_PSEUDO) {
 		ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
 		ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
 		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER) {
 			ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 		} else {
 			if_attach(ctx->ifc_ifp);
 			bpfattach(ctx->ifc_ifp, DLT_NULL, sizeof(u_int32_t));
 		}
 
 		if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 			device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 			goto fail_detach;
 		}
 		*ctxp = ctx;
 
 		/*
 		 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 		 * This must appear after the call to ether_ifattach() because
 		 * ether_ifattach() sets if_hdrlen to the default value.
 		 */
 		if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 			if_setifheaderlen(ifp,
 			    sizeof(struct ether_vlan_header));
 
 		if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 		iflib_add_device_sysctl_post(ctx);
 		ctx->ifc_flags |= IFC_INIT_DONE;
 		CTX_UNLOCK(ctx);
 		return (0);
 	}
 	/* Remaining (non-IFLIB_PSEUDO) case: full queue-backed attach. */
 	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
 	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
 
 	_iflib_pre_assert(scctx);
 	/* Keep a private copy of the driver's TX/RX method table. */
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 	/* An unset queue-set count, or one above a non-zero max, becomes the max. */
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	num_txd = iflib_num_tx_descs(ctx);
 	num_rxd = iflib_num_rx_descs(ctx);
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 	    num_txd, num_rxd);
 
 	/*
 	 * Cap the per-packet segment counts at
 	 * num_txd / MAX_SINGLE_PACKET_FRACTION, but never below 1.
 	 */
 	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, num_txd /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > num_txd /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    num_txd / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	/* Default the RSS table to 64 entries when the driver set none. */
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* XXX --- can support > 1 -- but keep it simple for now */
 	scctx->isc_intr = IFLIB_INTR_LEGACY;
 
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_iflib_detach;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx))) {
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
 
 	/*
 	 * XXX What if anything do we want to do about interrupts?
 	 */
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	/* XXX handle more than one queue */
 	/* Hand the first free list's ifsd_cl of each RX queue set to the driver. */
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
 
 	*ctxp = ctx;
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
 	CTX_UNLOCK(ctx);
 
 	return (0);
 	/*
 	 * Error unwinding: each label falls through to the ones below it.
 	 *
 	 * NOTE(review): fail_detach is also reached from the IFLIB_PSEUDO
 	 * branch above, where the ifnet may have been attached with
 	 * if_attach()/bpfattach() rather than ether_ifattach() and no queues
 	 * were allocated; confirm this cleanup is correct for that case.
 	 */
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 fail_iflib_detach:
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 fail_unlock:
 	CTX_UNLOCK(ctx);
 	iflib_deregister(ctx);
 fail_ctx_free:
 	free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (err);
 }
 
 /*
  * Detach and free a context created by iflib_pseudo_register().
  * Always returns 0.
  */
 int
 iflib_pseudo_deregister(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_t ifp = ctx->ifc_ifp;
 
 	/* Unregister VLAN event handlers early */
 	iflib_unregister_vlan_handlers(ctx);
 
 	/*
 	 * Everything except a non-Ethernet pseudo interface is detached
 	 * as Ethernet; the non-Ethernet pseudo case drops its bpf tap and
 	 * then the plain ifnet attachment.
 	 */
 	if ((sctx->isc_flags & IFLIB_PSEUDO) == 0 ||
 	    (sctx->isc_flags & IFLIB_PSEUDO_ETHER) != 0) {
 		ether_ifdetach(ifp);
 	} else {
 		bpfdetach(ifp);
 		if_detach(ifp);
 	}
 
 	/* Release task, queue and driver state. */
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 
 	iflib_deregister(ctx);
 
 	/* The softc is freed only if iflib allocated it. */
 	if ((ctx->ifc_flags & IFC_SC_ALLOCATED) != 0)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 
 	return (0);
 }
 
 /*
  * Attach entry point: obtain the driver's shared context through
  * DEVICE_REGISTER(), enable PCI bus mastering and register the device with
  * an iflib-allocated softc.
  *
  * Returns ENOTSUP when no shared context is returned or its magic number is
  * not IFLIB_MAGIC; otherwise the result of iflib_device_register().
  */
 int
 iflib_device_attach(device_t dev)
 {
 	if_shared_ctx_t sctx;
 	if_ctx_t ctx;
 
 	sctx = DEVICE_REGISTER(dev);
 	if (sctx == NULL)
 		return (ENOTSUP);
 	if (sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_enable_busmaster(dev);
 
 	/* A NULL softc makes iflib_device_register() allocate one. */
 	return (iflib_device_register(dev, NULL, sctx, &ctx));
 }
 
 /*
  * Detach and free the context of a hardware device.
  *
  * Returns EBUSY, leaving the context untouched, if a VLAN trunk is still in
  * use on the interface or (with PCI_IOV, for a non-VF context)
  * pci_iov_detach() fails.  Otherwise tears the context down and returns 0;
  * ctx must not be used afterwards.
  */
 int
 iflib_device_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	device_t dev = ctx->ifc_dev;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev, "Vlan in use, detach first\n");
 		return (EBUSY);
 	}
 #ifdef PCI_IOV
 	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
 		device_printf(dev, "SR-IOV in use; detach first.\n");
 		return (EBUSY);
 	}
 #endif
 
 	/* Mark the context as being detached. */
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 
 	/* Unregister VLAN handlers before calling iflib_stop() */
 	iflib_unregister_vlan_handlers(ctx);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
 
 	CTX_LOCK(ctx);
 	iflib_stop(ctx);
 	CTX_UNLOCK(ctx);
 
 	iflib_rem_pfil(ctx);
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 
 	/* Detach the tasks, then free the TX/RX structures. */
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 
 	/* Driver detach callbacks run under the context lock. */
 	CTX_LOCK(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
 	iflib_free_intr_mem(ctx);
 
 	bus_generic_detach(dev);
 
 	iflib_deregister(ctx);
 
 	device_set_softc(ctx->ifc_dev, NULL);
 	/* The softc is freed only if iflib allocated it. */
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	/*
 	 * Drop this context's reference on its cpu_offsets entry.  This reads
 	 * ctx->ifc_cpus, so it has to happen before ctx is freed.
 	 */
 	unref_ctx_core_offset(ctx);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 /*
  * Drain the per-TX-queue callouts and detach every group task of the
  * context that has been attached (non-NULL gt_uniq): the TX and RX queue
  * tasks from the I/O taskqueue group, the admin and VFLR tasks from the
  * config taskqueue group.
  */
 static void
 iflib_tqg_detach(if_ctx_t ctx)
 {
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int qidx;
 
 	/* XXX drain any dependent tasks */
 	for (qidx = 0; qidx < NTXQSETS(ctx); qidx++) {
 		txq = &ctx->ifc_txqs[qidx];
 		callout_drain(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_drain(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(qgroup_if_io_tqg, &txq->ift_task);
 	}
 
 	for (qidx = 0; qidx < NRXQSETS(ctx); qidx++) {
 		rxq = &ctx->ifc_rxqs[qidx];
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(qgroup_if_io_tqg, &rxq->ifr_task);
 	}
 
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_vflr_task);
 }
 
 /*
  * Release the context's interrupt resources: the legacy IRQ unless MSI-X
  * is in use, the MSI/MSI-X allocation unless a legacy interrupt is in use,
  * and the MSI-X memory resource if one is held.
  */
 static void
 iflib_free_intr_mem(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	if (scctx->isc_intr != IFLIB_INTR_MSIX)
 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 
 	if (scctx->isc_intr != IFLIB_INTR_LEGACY)
 		pci_release_msi(ctx->ifc_dev);
 
 	if (ctx->ifc_msix_mem == NULL)
 		return;
 
 	bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
 	    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
 	/* Clear the pointer now that the resource is released. */
 	ctx->ifc_msix_mem = NULL;
 }
 
 /*
  * Detach entry point: deregister the context stored as the device softc
  * and return the result of iflib_device_deregister().
  */
 int
 iflib_device_detach(device_t dev)
 {
 
 	return (iflib_device_deregister(device_get_softc(dev)));
 }
 
 /*
  * Suspend entry point: invoke the driver's IFDI_SUSPEND callback under the
  * context lock, then return the result of bus_generic_suspend().
  */
 int
 iflib_device_suspend(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SUSPEND(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (bus_generic_suspend(dev));
 }
 /*
  * Shutdown entry point: invoke the driver's IFDI_SHUTDOWN callback under
  * the context lock.
  *
  * NOTE(review): the return value comes from bus_generic_suspend(), not a
  * shutdown-specific bus routine; confirm that this is intended.
  */
 int
 iflib_device_shutdown(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SHUTDOWN(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (bus_generic_suspend(dev));
 }
 
 /*
  * Resume entry point: invoke the driver's IFDI_RESUME callback and
  * re-initialize the interface under the context lock, then call
  * iflib_txq_check_drain() on every TX queue with IFLIB_RESTART_BUDGET.
  * Returns the result of bus_generic_resume().
  */
 int
 iflib_device_resume(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int qidx;
 
 	CTX_LOCK(ctx);
 	IFDI_RESUME(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	for (qidx = 0; qidx < NTXQSETS(ctx); qidx++)
 		iflib_txq_check_drain(&txqs[qidx], IFLIB_RESTART_BUDGET);
 
 	return (bus_generic_resume(dev));
 }
 
 /*
  * SR-IOV init entry point: forward num_vfs and params to the driver's
  * IFDI_IOV_INIT callback under the context lock and return its result.
  */
 int
 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	int rc;
 
 	CTX_LOCK(ctx);
 	rc = IFDI_IOV_INIT(ctx, num_vfs, params);
 	CTX_UNLOCK(ctx);
 
 	return (rc);
 }
 
 /*
  * SR-IOV uninit entry point: invoke the driver's IFDI_IOV_UNINIT callback
  * under the context lock.
  */
 void
 iflib_device_iov_uninit(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_IOV_UNINIT(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * SR-IOV add-VF entry point: forward vfnum and params to the driver's
  * IFDI_IOV_VF_ADD callback under the context lock and return its result.
  */
 int
 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	int rc;
 
 	CTX_LOCK(ctx);
 	rc = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 	CTX_UNLOCK(ctx);
 
 	return (rc);
 }
 
 /*********************************************************************
  *
  *  MODULE FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * Module load-time initialization.
  *
  * Sets iflib_timer_default to hz / 2 and returns 0.
  * NOTE(review): presumably hz is the tick rate, making this half a second
  * - confirm.
  */
 static int
 iflib_module_init(void)
 {
 	iflib_timer_default = hz / 2;
 	return (0);
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD runs iflib_module_init() and returns its result.  MOD_UNLOAD is
  * always refused with EBUSY.  Any other event returns EOPNOTSUPP.  The mod
  * and arg parameters are not used.
  */
 static int
 iflib_module_event_handler(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (iflib_module_init());
 	if (what == MOD_UNLOAD)
 		return (EBUSY);
 	return (EOPNOTSUPP);
 }
 
 /*********************************************************************
  *
  *  PUBLIC FUNCTION DEFINITIONS
  *     ordered as in iflib.h
  *
  **********************************************************************/
 
 /*
  * Assertion-only sanity check of a driver's shared context: the TX/RX size
  * limits must be non-zero, the number of queues per set must be in [1, 8],
  * and every per-queue descriptor count (min, max, default) must be a
  * non-zero power of two.
  */
 static void
 _iflib_assert(if_shared_ctx_t sctx)
 {
 	int qnum;
 
 	/* Transmit size limits. */
 	MPASS(sctx->isc_tx_maxsize);
 	MPASS(sctx->isc_tx_maxsegsize);
 
 	/* Receive size limits. */
 	MPASS(sctx->isc_rx_maxsize);
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
 
 	/* RX descriptor counts, one triple per queue in a set. */
 	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
 	for (qnum = 0; qnum < sctx->isc_nrxqs; qnum++) {
 		MPASS(sctx->isc_nrxd_min[qnum]);
 		MPASS(powerof2(sctx->isc_nrxd_min[qnum]));
 		MPASS(sctx->isc_nrxd_max[qnum]);
 		MPASS(powerof2(sctx->isc_nrxd_max[qnum]));
 		MPASS(sctx->isc_nrxd_default[qnum]);
 		MPASS(powerof2(sctx->isc_nrxd_default[qnum]));
 	}
 
 	/* TX descriptor counts, one triple per queue in a set. */
 	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
 	for (qnum = 0; qnum < sctx->isc_ntxqs; qnum++) {
 		MPASS(sctx->isc_ntxd_min[qnum]);
 		MPASS(powerof2(sctx->isc_ntxd_min[qnum]));
 		MPASS(sctx->isc_ntxd_max[qnum]);
 		MPASS(powerof2(sctx->isc_ntxd_max[qnum]));
 		MPASS(sctx->isc_ntxd_default[qnum]);
 		MPASS(powerof2(sctx->isc_ntxd_default[qnum]));
 	}
 }
 
 /*
  * Assertion-only check that the driver's TX/RX method table
  * (scctx->isc_txrx) provides every entry point listed below.  Callers in
  * this file invoke it after IFDI_ATTACH_PRE and before copying the table
  * into the context.
  */
 static void
 _iflib_pre_assert(if_softc_ctx_t scctx)
 {
 
 	/* Transmit-side methods. */
 	MPASS(scctx->isc_txrx->ift_txd_encap);
 	MPASS(scctx->isc_txrx->ift_txd_flush);
 	MPASS(scctx->isc_txrx->ift_txd_credits_update);
 	/* Receive-side methods. */
 	MPASS(scctx->isc_txrx->ift_rxd_available);
 	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
 	MPASS(scctx->isc_txrx->ift_rxd_refill);
 	MPASS(scctx->isc_txrx->ift_rxd_flush);
 }
 
 /*
  * First-stage setup shared by iflib_device_register() and
  * iflib_pseudo_register(): initialize the context and state locks,
  * allocate the ifnet, bind the driver's kobj methods to the context,
  * install the ifnet callbacks and flags, register the VLAN event handlers
  * and, unless the driver supplies its own media (IFLIB_DRIVER_MEDIA),
  * initialize the context's ifmedia.
  *
  * Returns 0 on success.  Returns ENOMEM if the ifnet cannot be allocated;
  * the locks are destroyed again in that case, because callers respond to a
  * failure here by freeing ctx without calling iflib_deregister().
  */
 static int
 iflib_register(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
 	u_char type;
 	int iflags;
 
 	/* Only non-pseudo drivers have their shared context sanity-checked. */
 	if ((sctx->isc_flags & IFLIB_PSEUDO) == 0)
 		_iflib_assert(sctx);
 
 	CTX_LOCK_INIT(ctx);
 	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 	/* Everything is Ethernet except a non-Ethernet pseudo interface. */
 	if (sctx->isc_flags & IFLIB_PSEUDO) {
 		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER)
 			type = IFT_ETHER;
 		else
 			type = IFT_PPP;
 	} else
 		type = IFT_ETHER;
 	ifp = ctx->ifc_ifp = if_alloc(type);
 	if (ifp == NULL) {
 		device_printf(dev, "can not allocate ifnet structure\n");
 		/*
 		 * Undo the lock initialization above, in the same order as
 		 * iflib_deregister(), so ctx is not freed by the caller
 		 * with live locks inside it.
 		 */
 		STATE_LOCK_DESTROY(ctx);
 		CTX_LOCK_DESTROY(ctx);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Initialize our context's device specific methods
 	 */
 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 	kobj_class_compile((kobj_class_t) driver);
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setsoftc(ifp, ctx);
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
 #ifdef ALTQ
 	if_setstartfn(ifp, iflib_altq_if_start);
 	if_settransmitfn(ifp, iflib_altq_if_transmit);
 	if_setsendqready(ifp);
 #else
 	if_settransmitfn(ifp, iflib_if_transmit);
 #endif
 	if_setqflushfn(ifp, iflib_if_qflush);
 	iflags = IFF_MULTICAST | IFF_KNOWSEPOCH;
 
 	/* Non-Ethernet pseudo interfaces are point-to-point links. */
 	if ((sctx->isc_flags & IFLIB_PSEUDO) &&
 	    (sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0)
 		iflags |= IFF_POINTOPOINT;
 	else
 		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 	if_setflags(ifp, iflags);
 	ctx->ifc_vlan_attach_event =
 	    EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 	    EVENTHANDLER_PRI_FIRST);
 	ctx->ifc_vlan_detach_event =
 	    EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 	    EVENTHANDLER_PRI_FIRST);
 
 	/* Use the context's own ifmedia unless the driver provides one. */
 	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
 		ctx->ifc_mediap = &ctx->ifc_media;
 		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
 		    iflib_media_change, iflib_media_status);
 	}
 	return (0);
 }
 
 /*
  * Deregister the context's vlan_config and vlan_unconfig event handlers.
  * Each tag is cleared after deregistration, so a second call does nothing.
  */
 static void
 iflib_unregister_vlan_handlers(if_ctx_t ctx)
 {
 
 	if (ctx->ifc_vlan_attach_event != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_config,
 		    ctx->ifc_vlan_attach_event);
 		ctx->ifc_vlan_attach_event = NULL;
 	}
 
 	if (ctx->ifc_vlan_detach_event != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_unconfig,
 		    ctx->ifc_vlan_detach_event);
 		ctx->ifc_vlan_detach_event = NULL;
 	}
 }
 
 /*
  * Undo iflib_register(): remove the context's media entries, make sure the
  * VLAN event handlers are gone, release the kobj binding, free the ifnet
  * and destroy the state and context locks.
  */
 static void
 iflib_deregister(if_ctx_t ctx)
 {
 
 	/* Drop every entry from the context's own ifmedia. */
 	ifmedia_removeall(&ctx->ifc_media);
 
 	/* No-op if the handlers were already unregistered. */
 	iflib_unregister_vlan_handlers(ctx);
 
 	/* Release the kobj reference taken by kobj_init(). */
 	kobj_delete((kobj_t) ctx, NULL);
 
 	/* Free the ifnet structure. */
 	if_free(ctx->ifc_ifp);
 
 	STATE_LOCK_DESTROY(ctx);
 
 	/*
 	 * ether_ifdetach calls if_qflush, so the context lock must only be
 	 * destroyed after that has happened.
 	 */
 	CTX_LOCK_DESTROY(ctx);
 }
 
 static int
 iflib_queues_alloc(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int nrxqsets = scctx->isc_nrxqsets;
 	int ntxqsets = scctx->isc_ntxqsets;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
 	int i, j, cpu, err, txconf, rxconf;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
 	uint32_t *txqsizes = scctx->isc_txqsizes;
 	uint8_t nrxqs = sctx->isc_nrxqs;
 	uint8_t ntxqs = sctx->isc_ntxqs;
 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 	int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs >= fl_offset + nfree_lists,
            ("there must be at least a rxq for each free list"));
 
 	/* Allocate the TX ring struct memory */
 	if (!(ctx->ifc_txqs =
 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX ring memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Now allocate the RX */
 	if (!(ctx->ifc_rxqs =
 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate RX ring memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
 
 	txq = ctx->ifc_txqs;
 	rxq = ctx->ifc_rxqs;
 
 	/*
 	 * XXX handle allocation failure
 	 */
 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
 		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate TX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		txq->ift_ifdi = ifdip;
 		for (j = 0; j < ntxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate TX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 		}
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 			txq->ift_br_offset = 1;
 		} else {
 			txq->ift_br_offset = 0;
 		}
 
 		if (iflib_txsd_alloc(txq)) {
 			device_printf(dev, "Critical Failure setting up TX buffers\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		/* Initialize the TX lock */
 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
 		    device_get_nameunit(dev), txq->ift_id);
 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
 		txq->ift_timer.c_cpu = cpu;
 #ifdef DEV_NETMAP
 		callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
 		txq->ift_netmap_timer.c_cpu = cpu;
 #endif /* DEV_NETMAP */
 
 		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
 				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
 		if (err) {
 			/* XXX free any allocated rings */
 			device_printf(dev, "Unable to allocate buf_ring\n");
 			goto err_tx_desc;
 		}
 	}
 
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 		/* Set up some basics */
 		callout_init(&rxq->ifr_watchdog, 1);
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
 		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		rxq->ifr_ifdi = ifdip;
 		/* XXX this needs to be changed if #rx queues != #tx queues */
 		rxq->ifr_ntxqirq = 1;
 		rxq->ifr_txqid[0] = i;
 		for (j = 0; j < nrxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate RX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 		}
 		rxq->ifr_ctx = ctx;
 		rxq->ifr_id = i;
 		rxq->ifr_fl_offset = fl_offset;
 		rxq->ifr_nfl = nfree_lists;
 		if (!(fl =
 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev, "Unable to allocate free list memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		rxq->ifr_fl = fl;
 		for (j = 0; j < nfree_lists; j++) {
 			fl[j].ifl_rxq = rxq;
 			fl[j].ifl_id = j;
 			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
 			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 		}
 		/* Allocate receive buffers for the ring */
 		if (iflib_rxsd_alloc(rxq)) {
 			device_printf(dev,
 			    "Critical Failure setting up receive buffers\n");
 			err = ENOMEM;
 			goto err_rx_desc;
 		}
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) 
 			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
 			    M_WAITOK);
 	}
 
 	/* TXQs */
 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < ntxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 
 		for (j = 0; j < ntxqs; j++, di++) {
 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
 			paddrs[i*ntxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device TX queue\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	/* RXQs */
 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < nrxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 
 		for (j = 0; j < nrxqs; j++, di++) {
 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
 			paddrs[i*nrxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device RX queue\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	return (0);
 
 /* XXX handle allocation failure changes */
 err_rx_desc:
 err_tx_desc:
 rx_fail:
 	if (ctx->ifc_rxqs != NULL)
 		free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 	if (ctx->ifc_txqs != NULL)
 		free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 fail:
 	return (err);
 }
 
 /*
  * Run the per-queue TX setup on every TX queue set of the context.
  * The result of iflib_txq_setup() is not examined; this always returns 0.
  */
 static int
 iflib_tx_structures_setup(if_ctx_t ctx)
 {
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int qidx;
 
 	for (qidx = 0; qidx < NTXQSETS(ctx); qidx++)
 		iflib_txq_setup(&txqs[qidx]);
 
 	return (0);
 }
 
 /*
  * Release everything hanging off the TX queue sets: the DMA areas of each
  * ring, then the per-queue state, and finally the queue set array itself.
  * ctx->ifc_txqs is left NULL.
  */
 static void
 iflib_tx_structures_free(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int qidx, ring;
 
 	for (qidx = 0; qidx < NTXQSETS(ctx); qidx++) {
 		for (ring = 0; ring < sctx->isc_ntxqs; ring++)
 			iflib_dma_free(&txqs[qidx].ift_ifdi[ring]);
 		iflib_txq_destroy(&txqs[qidx]);
 	}
 
 	free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 }
 
 /*********************************************************************
  *
  *  Initialize all receive rings.
  *
  *  For each RX queue set: when INET or INET6 is configured, set up its
  *  LRO state, then invoke the driver's IFDI_RXQ_SETUP method.
  *  Returns 0 on success, or the tcp_lro_init_args() error after
  *  releasing the LRO state of the queues set up before the failure.
  *
  **********************************************************************/
 static int
 iflib_rx_structures_setup(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq;
 	int qidx;
 #if defined(INET6) || defined(INET)
 	int done, err;
 #endif
 
 	rxq = ctx->ifc_rxqs;
 	for (qidx = 0; qidx < ctx->ifc_softc_ctx.isc_nrxqsets; qidx++, rxq++) {
 #if defined(INET6) || defined(INET)
 		err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 		    TCP_LRO_ENTRIES, min(1024,
 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
 		if (err != 0) {
 			device_printf(ctx->ifc_dev,
 			    "LRO Initialization failed!\n");
 			/*
 			 * Only the queues that completed hold LRO
 			 * resources; queue 'qidx' is the one that failed
 			 * and is not freed here.
 			 */
 			rxq = ctx->ifc_rxqs;
 			for (done = 0; done < qidx; done++, rxq++)
 				tcp_lro_free(&rxq->ifr_lc);
 			return (err);
 		}
 #endif
 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 	}
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free all receive rings.
  *
  *  For every RX queue set the DMA areas of each ring are released,
  *  followed by the software descriptors and (with INET/INET6) the LRO
  *  state.  The queue set array is then freed and ctx->ifc_rxqs cleared.
  *
  **********************************************************************/
 static void
 iflib_rx_structures_free(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_rxq_t rxqs = ctx->ifc_rxqs;
 	int qidx, ring;
 
 	for (qidx = 0; qidx < ctx->ifc_softc_ctx.isc_nrxqsets; qidx++) {
 		for (ring = 0; ring < sctx->isc_nrxqs; ring++)
 			iflib_dma_free(&rxqs[qidx].ifr_ifdi[ring]);
 		iflib_rx_sds_free(&rxqs[qidx]);
 #if defined(INET6) || defined(INET)
 		tcp_lro_free(&rxqs[qidx].ifr_lc);
 #endif
 	}
 
 	free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 }
 
 /*
  * Set up the TX structures and then the RX structures of the context.
  * Returns 0 on success or the first error encountered.  Nothing is
  * unwound here: the caller is expected to free the queues on failure.
  */
 static int
 iflib_qset_structures_setup(if_ctx_t ctx)
 {
 	int err;
 
 	err = iflib_tx_structures_setup(ctx);
 	if (err != 0) {
 		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
 		return (err);
 	}
 
 	err = iflib_rx_structures_setup(ctx);
 	if (err != 0)
 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 
 	return (err);
 }
 
 /*
  * Allocate an interrupt for the context by handing the arguments to
  * _iflib_irq_alloc() and returning its result.
  *
  * NOTE(review): filter_arg is accepted but not forwarded; only 'arg' is
  * passed on to _iflib_irq_alloc().  Confirm this is intended.
  */
 int
 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
 {
 
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
 /*
  * Shared helper: pick the CPU for queue 'qid' (starting from the
  * configured core offset) and attach the grouptask to the taskqgroup on
  * that CPU.  'irq' may be NULL, in which case no interrupt resource is
  * passed along.  Returns 0 or the taskqgroup_attach_cpu() error.
  */
 static inline int
 iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
     int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
     const char *name)
 {
 	device_t dev = ctx->ifc_dev;
 	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
 	unsigned int cpuid;
 	int err;
 
 	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid,
 	    type == IFLIB_INTR_TX);
 	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
 	    irq != NULL ? irq->ii_res : NULL, name);
 	if (err != 0) {
 		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
 		return (err);
 	}
 #ifdef notyet
 	if (cpuid > ctx->ifc_cpuid_highest)
 		ctx->ifc_cpuid_highest = cpuid;
 #endif
 	return (0);
 }
 
 /*
  * Allocate an interrupt of the given iflib type and hook it to the
  * matching grouptask.
  *
  * The switch selects, per interrupt type, the queue (or the context
  * itself for IFLIB_INTR_ADMIN), its filter-info record, its grouptask,
  * the taskqgroup and the fast interrupt handler.  The filter-info record
  * is then filled in and passed as the handler argument to
  * _iflib_irq_alloc().
  *
  * Returns 0 on success, EINVAL for an unknown type, or the error from
  * _iflib_irq_alloc() / iflib_irq_set_affinity().
  */
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 			iflib_intr_type_t type, driver_filter_t *filter,
 			void *filter_arg, int qid, const char *name)
 {
 	device_t dev;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
 	gtask_fn_t *fn;
 	int tqrid, err;
 	driver_filter_t *intr_fast;
 	void *q;
 
 	/* Overwritten by every valid case of the switch below. */
 	info = &ctx->ifc_filter_info;
 	tqrid = rid;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		info = &ctx->ifc_txqs[qid].ift_filter_info;
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
 		intr_fast = iflib_fast_intr;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		intr_fast = iflib_fast_intr;
 		NET_GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_RXTX:
 		/* Same RX queue/task as IFLIB_INTR_RX, different fast handler. */
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		intr_fast = iflib_fast_intr_rxtx;
 		NET_GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_ADMIN:
 		/* No GROUPTASK_INIT here, unlike the queue cases. */
 		q = ctx;
 		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
 		fn = _task_fn_admin;
 		intr_fast = iflib_fast_intr_ctx;
 		break;
 	default:
 		device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
 		    __func__);
 		return (EINVAL);
 	}
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
 	info->ifi_ctx = q;
 
 	dev = ctx->ifc_dev;
 	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
 	if (err != 0) {
 		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
 	}
 	/* The admin task is not attached to a taskqgroup here. */
 	if (type == IFLIB_INTR_ADMIN)
 		return (0);
 
 	/* tqrid is the caller's rid for queue types; -1 skips CPU pinning. */
 	if (tqrid != -1) {
 		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
 		    name);
 		if (err)
 			return (err);
 	} else {
 		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
 	}
 
 	return (0);
 }
 
 /*
  * Initialize the grouptask for a TX queue, an RX queue or the IOV (VFLR)
  * task and attach it to its taskqgroup.  A CPU-pinned attach is tried
  * first; if that fails the task is attached without a CPU preference.
  * Panics on any other interrupt type.  'arg' is not used.
  */
 void
 iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	void *q;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		GROUPTASK_INIT(gtask, 0, _task_fn_tx, q);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
 		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
 		GROUPTASK_INIT(gtask, 0, _task_fn_iov, q);
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 
 	/* Fall back to an unpinned attach when the pinned one fails. */
 	if (iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
 	    name) != 0) {
 		taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
 		    irq != NULL ? irq->ii_res : NULL, name);
 	}
 }
 
 /*
  * Undo an interrupt allocation: tear down the handler if one was set up,
  * then release the IRQ resource if one is held.  Either step is skipped
  * when its field in 'irq' is unset.
  */
 void
 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 {
 	device_t dev = ctx->ifc_dev;
 
 	if (irq->ii_tag != NULL)
 		bus_teardown_intr(dev, irq->ii_res, irq->ii_tag);
 
 	if (irq->ii_res != NULL) {
 		bus_release_resource(dev, SYS_RES_IRQ,
 		    rman_get_rid(irq->ii_res), irq->ii_res);
 	}
 }
 
 /*
  * Set up the single shared interrupt used in legacy mode.
  *
  * The filter info of RX queue 0 is filled in and registered with one
  * interrupt resource; the fast handler and its context depend on whether
  * the driver set IFLIB_SINGLE_IRQ_RX_ONLY.  The RX task of queue 0 and
  * the TX task of queue 0 are then both attached to the I/O taskqgroup
  * using that same resource.
  *
  * Returns 0 on success or the error from _iflib_irq_alloc().
  */
 static int
 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
 {
 	if_irq_t irq = &ctx->ifc_legacy_irq;
 	iflib_rxq_t rxq0 = &ctx->ifc_rxqs[0];
 	iflib_txq_t txq0 = ctx->ifc_txqs;
 	iflib_filter_info_t info = &rxq0->ifr_filter_info;
 	struct grouptask *rx_task = &rxq0->ifr_task;
 	device_t dev = ctx->ifc_dev;
 	driver_filter_t *intr_fast;
 	struct resource *res;
 	int err;
 
 	ctx->ifc_flags |= IFC_LEGACY;
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = rx_task;
 	if ((ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0) {
 		info->ifi_ctx = ctx;
 		intr_fast = iflib_fast_intr_ctx;
 	} else {
 		info->ifi_ctx = rxq0;
 		intr_fast = iflib_fast_intr_rxtx;
 	}
 
 	/* We allocate a single interrupt resource */
 	err = _iflib_irq_alloc(ctx, irq, *rid, intr_fast, NULL, info, name);
 	if (err != 0)
 		return (err);
 
 	res = irq->ii_res;
 	NET_GROUPTASK_INIT(rx_task, 0, _task_fn_rx, rxq0);
 	taskqgroup_attach(qgroup_if_io_tqg, rx_task, rxq0, dev, res, name);
 
 	GROUPTASK_INIT(&txq0->ift_task, 0, _task_fn_tx, txq0);
 	taskqgroup_attach(qgroup_if_io_tqg, &txq0->ift_task, txq0, dev, res,
 	    "tx");
 	return (0);
 }
 
 /*
  * Create the LED device for this context, named after the device's
  * name/unit string, with iflib_led_func as its callback.
  */
 void
 iflib_led_create(if_ctx_t ctx)
 {
 	const char *unit_name = device_get_nameunit(ctx->ifc_dev);
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx, unit_name);
 }
 
 /* Enqueue the grouptask of TX queue 'txqid'. */
 void
 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 {
 	struct grouptask *tx_task = &ctx->ifc_txqs[txqid].ift_task;
 
 	GROUPTASK_ENQUEUE(tx_task);
 }
 
 /* Enqueue the grouptask of RX queue 'rxqid'. */
 void
 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 {
 	struct grouptask *rx_task = &ctx->ifc_rxqs[rxqid].ifr_task;
 
 	GROUPTASK_ENQUEUE(rx_task);
 }
 
 /*
  * Enqueue the context's admin grouptask.  The task must already have a
  * taskqueue assigned (asserted via MPASS).
  */
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
 	struct grouptask *admin_task = &ctx->ifc_admin_task;
 
 	MPASS(admin_task->gt_taskqueue != NULL);
 	GROUPTASK_ENQUEUE(admin_task);
 }
 
 /* Enqueue the context's VFLR (IOV) grouptask. */
 void
 iflib_iov_intr_deferred(if_ctx_t ctx)
 {
 	struct grouptask *vflr_task = &ctx->ifc_vflr_task;
 
 	GROUPTASK_ENQUEUE(vflr_task);
 }
 
 /*
  * Attach grouptask 'gt' to the I/O taskqgroup on the given CPU, with no
  * device and no interrupt resource.  The return value of
  * taskqgroup_attach_cpu() is discarded.
  */
 void
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
 {
 
 	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
 	    name);
 }
 
 /*
  * Initialize 'gtask' to run 'fn' with 'ctx' as its argument and attach it
  * to the config taskqgroup, with no device and no interrupt resource.
  * Note that gtask is passed both as the task and as the third argument
  * of taskqgroup_attach().
  */
 void
 iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
 	const char *name)
 {
 
 	GROUPTASK_INIT(gtask, 0, fn, ctx);
 	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL,
 	    name);
 }
 
 /* Detach 'gtask' from the config taskqgroup. */
 void
 iflib_config_gtask_deinit(struct grouptask *gtask)
 {
 
 	taskqgroup_detach(qgroup_if_config_tqg, gtask);	
 }
 
 /*
  * Record a link state change reported by the driver.
  *
  * Updates the interface baudrate, turns on IFC_PREFETCH for links of
  * 10Gbps and above, marks every TX queue idle when the link goes from up
  * to down, stores the new state and forwards it to the network stack.
  */
 void
 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int qidx;
 
 	if_setbaudrate(ifp, baudrate);
 	if (baudrate >= IF_Gbps(10)) {
 		STATE_LOCK(ctx);
 		ctx->ifc_flags |= IFC_PREFETCH;
 		STATE_UNLOCK(ctx);
 	}
 
 	/* If link down, disable watchdog */
 	if (ctx->ifc_link_state == LINK_STATE_UP &&
 	    link_state == LINK_STATE_DOWN) {
 		for (qidx = 0; qidx < ctx->ifc_softc_ctx.isc_ntxqsets; qidx++)
 			txqs[qidx].ift_qstatus = IFLIB_QUEUE_IDLE;
 	}
 
 	ctx->ifc_link_state = link_state;
 	if_link_state_change(ifp, link_state);
 }
 
 /*
  * Ask the driver how many TX descriptors have completed on 'txq' and
  * account for them.
  *
  * The first DMA area of the queue is synced before the driver callback
  * runs.  The returned credit count is added to ift_processed and to
  * ift_cidx_processed, and the latter is wrapped once at ift_size.
  * Returns the number of credits (0 when nothing completed).
  */
 static int
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	int ncredits;
 #ifdef INVARIANTS
 	int cidx_before = txq->ift_cidx_processed;
 #endif
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD);
 	ncredits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
 	    true);
 	if (ncredits == 0)
 		return (0);
 
 	txq->ift_processed += ncredits;
 	txq->ift_cidx_processed += ncredits;
 
 	MPASS(cidx_before + ncredits == txq->ift_cidx_processed);
 	/* Wrap the processed index back into the ring. */
 	if (txq->ift_cidx_processed >= txq->ift_size)
 		txq->ift_cidx_processed -= txq->ift_size;
 	return (ncredits);
 }
 
 /*
  * Sync the DMA area of every free list of 'rxq', then return what the
  * driver's isc_rxd_available callback reports for (cidx, budget).
  */
 static int
 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 {
 	iflib_fl_t fl;
 	u_int flidx;
 
 	for (flidx = 0; flidx < rxq->ifr_nfl; flidx++) {
 		fl = &rxq->ifr_fl[flidx];
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	}
 
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 	    budget));
 }
 
 /*
  * Fill in 'info' with the context, offset and initial value, then
  * register a read/write integer sysctl called 'name' under the device's
  * sysctl tree.  The sysctl is served by iflib_sysctl_int_delay with
  * 'info' as its argument.
  */
 void
 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 	const char *description, if_int_delay_info_t info,
 	int offset, int value)
 {
 	info->iidi_ctx = ctx;
 	info->iidi_offset = offset;
 	info->iidi_value = value;
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
 	    OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    info, 0, iflib_sysctl_int_delay, "I", description);
 }
 
 /* Return a pointer to the context's sx lock (ifc_ctx_sx). */
 struct sx *
 iflib_ctx_lock_get(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_ctx_sx);
 }
 
 /*
  * Choose the interrupt mode (MSI-X, MSI or legacy) and the number of RX
  * and TX queue sets for the context.
  *
  * MSI-X is tried first unless isc_disable_msix is set or the device
  * reports no MSI-X messages.  The MSI-X BAR is mapped unless the driver
  * set isc_msix_bar to -1.  The queue counts start from the number of
  * available messages less the admin vectors and are bounded by the
  * number of CPUs in ifc_cpus, the RSS bucket count (with RSS), the
  * sysctl overrides and the driver's own isc_nrxqsets/isc_ntxqsets
  * limits.  rx_queues + admincnt vectors are then requested.
  *
  * On MSI-X success the softc context records the vector count, the
  * queue set counts and IFLIB_INTR_MSIX, and the vector count is returned.
  * Otherwise the code falls through to the 'msi' label: one RX and one TX
  * queue set, using MSI when exactly one MSI message is available and can
  * be allocated, else a legacy interrupt.  In that case the returned value
  * is 'vectors' as left by pci_msi_count()/pci_alloc_msi().
  */
 static int
 iflib_msix_init(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
 	int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
 
 	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
 	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
 
 	if (bootverbose)
 		device_printf(dev, "msix_init qsets capped at %d\n",
 		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
 
 	/* Override by tuneable */
 	if (scctx->isc_disable_msix)
 		goto msi;
 
 	/* First try MSI-X */
 	if ((msgs = pci_msix_count(dev)) == 0) {
 		if (bootverbose)
 			device_printf(dev, "MSI-X not supported or disabled\n");
 		goto msi;
 	}
 
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	/*
 	 * bar == -1 => "trust me I know what I'm doing"
 	 * Some drivers are for hardware that is so shoddily
 	 * documented that no one knows which bars are which
 	 * so the developer has to map all bars. This hack
 	 * allows shoddy garbage to use MSI-X in this framework.
 	 */
 	if (bar != -1) {
 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
 		if (ctx->ifc_msix_mem == NULL) {
 			device_printf(dev, "Unable to map MSI-X table\n");
 			goto msi;
 		}
 	}
 
 	admincnt = sctx->isc_admin_intrcnt;
 #if IFLIB_DEBUG
 	/* use only 1 qset in debug mode */
 	queuemsgs = min(msgs - admincnt, 1);
 #else
 	queuemsgs = msgs - admincnt;
 #endif
 #ifdef RSS
 	queues = imin(queuemsgs, rss_getnumbuckets());
 #else
 	queues = queuemsgs;
 #endif
 	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
 	if (bootverbose)
 		device_printf(dev,
 		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
 		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 #ifdef  RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
 #endif
 	/*
 	 * NOTE(review): queuemsgs already has admincnt subtracted above, so
 	 * the bound below subtracts admincnt a second time - confirm this is
 	 * intended.
 	 */
 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
 
 	if (rx_queues > scctx->isc_nrxqsets)
 		rx_queues = scctx->isc_nrxqsets;
 
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 		tx_queues = iflib_num_tx_queues;
 	else
 		tx_queues = mp_ncpus;
 
 	if (tx_queues > scctx->isc_ntxqsets)
 		tx_queues = scctx->isc_ntxqsets;
 
 	/* Without the override both counts are reduced to the smaller one. */
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
 #ifdef INVARIANTS
 		if (tx_queues != rx_queues)
 			device_printf(dev,
 			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 #endif
 		tx_queues = min(rx_queues, tx_queues);
 		rx_queues = min(rx_queues, tx_queues);
 	}
 
 	/* Only RX queues and admin interrupts are counted as vectors. */
 	vectors = rx_queues + admincnt;
 	if (msgs < vectors) {
 		device_printf(dev,
 		    "insufficient number of MSI-X vectors "
 		    "(supported %d, need %d)\n", msgs, vectors);
 		/*
 		 * NOTE(review): unlike the failure paths below, this one
 		 * does not release ifc_msix_mem before falling back -
 		 * confirm it is released elsewhere.
 		 */
 		goto msi;
 	}
 
 	device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
 	    tx_queues);
 	/* Remember the requested count; pci_alloc_msix() updates 'vectors'. */
 	msgs = vectors;
 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 		if (vectors != msgs) {
 			device_printf(dev,
 			    "Unable to allocate sufficient MSI-X vectors "
 			    "(got %d, need %d)\n", vectors, msgs);
 			pci_release_msi(dev);
 			if (bar != -1) {
 				bus_release_resource(dev, SYS_RES_MEMORY, bar,
 				    ctx->ifc_msix_mem);
 				ctx->ifc_msix_mem = NULL;
 			}
 			goto msi;
 		}
 		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
 		    vectors);
 		scctx->isc_vectors = vectors;
 		scctx->isc_nrxqsets = rx_queues;
 		scctx->isc_ntxqsets = tx_queues;
 		scctx->isc_intr = IFLIB_INTR_MSIX;
 
 		return (vectors);
 	} else {
 		device_printf(dev,
 		    "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
 		    err);
 		if (bar != -1) {
 			bus_release_resource(dev, SYS_RES_MEMORY, bar,
 			    ctx->ifc_msix_mem);
 			ctx->ifc_msix_mem = NULL;
 		}
 	}
 
 	/* Fallback: a single queue set on MSI, or on a legacy interrupt. */
 msi:
 	vectors = pci_msi_count(dev);
 	scctx->isc_nrxqsets = 1;
 	scctx->isc_ntxqsets = 1;
 	scctx->isc_vectors = vectors;
 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 		device_printf(dev,"Using an MSI interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_MSI;
 	} else {
 		scctx->isc_vectors = 1;
 		device_printf(dev,"Using a Legacy interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 	}
 
 	return (vectors);
 }
 
 static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 
 static int
 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
 	struct sbuf *sb;
 	const char *ring_state = "UNKNOWN";
 
 	/* XXX needed ? */
 	rc = sysctl_wire_old_buffer(req, 0);
 	MPASS(rc == 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 	MPASS(sb != NULL);
 	if (sb == NULL)
 		return (ENOMEM);
 	if (state[3] <= 3)
 		ring_state = ring_states[state[3]];
 
 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 		    state[0], state[1], state[2], ring_state);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
         return(rc);
 }
 
 enum iflib_ndesc_handler {
 	IFLIB_NTXD_HANDLER,
 	IFLIB_NRXD_HANDLER,
 };
 
 static int
 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 {
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
 	qidx_t *ndesc;
 	char *p, *next;
 	int nqs, rc, i;
 
 	nqs = 8;
 	switch(type) {
 	case IFLIB_NTXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_ntxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_ntxqs;
 		break;
 	case IFLIB_NRXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_nrxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_nrxqs;
 		break;
 	default:
 		printf("%s: unhandled type\n", __func__);
 		return (EINVAL);
 	}
 	if (nqs == 0)
 		nqs = 8;
 
 	for (i=0; i<8; i++) {
 		if (i >= nqs)
 			break;
 		if (i)
 			strcat(buf, ",");
 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
 	}
 
 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (rc || req->newptr == NULL)
 		return rc;
 
 	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
 	    i++, p = strsep(&next, " ,")) {
 		ndesc[i] = strtoul(p, NULL, 10);
 	}
 
 	return(rc);
 }
 
 /* Size of the scratch buffer used to build sysctl node names. */
 #define NAME_BUFLEN 32
 /*
  * Create the "iflib" sysctl node under the device's sysctl tree and
  * populate it with the driver version string and the tunables backed by
  * fields of the context: queue and descriptor count overrides, MSI-X
  * disable, RX budget, TX abdicate and the core placement knobs.
  * The node is saved in ctx->ifc_sysctl_node.
  */
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
 	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
 	ctx_list = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
 		       "driver version");
 
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 			"# of txqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 			"# of rxqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
                        "permit #txq != #rxq");
 	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
                       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
                       "disable MSI-X (default 0)");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
 		       "set the RX budget");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
 		       "cause TX to abdicate instead of running to completion");
 	/* Set the "unspecified" marker before the core_offset sysctl is added. */
 	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
 		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
 		       "offset to start using cores at");
 	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
 		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
 		       "use separate cores for TX and RX");
 	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
 		      CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
 		      "try to make use of logical cores for TX and RX");
 
 	/* XXX change for per-queue sizes */
 	/* Both list sysctls share mp_ndesc_handler; arg2 selects TX or RX. */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of TX descriptors to use, 0 = use default #");
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of RX descriptors to use, 0 = use default #");
 }
 
 /*
  * Add the per-queue read-only sysctl nodes beneath the node stored in
  * ctx->ifc_sysctl_node.
  *
  * One "txqN" node is created per TX queue set, exposing its bound CPU,
  * counters, ring indices and the mp_ring state and statistics.  One
  * "rxqN" node is created per RX queue set, exposing its bound CPU, the
  * completion queue index (only with IFLIB_HAS_RXCQ) and one "rxq_flN"
  * child per free list.  The MEMORY_LOGGING counters are only compiled in
  * when that macro is enabled.
  */
 static void
 iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx_list;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	char namebuf[NAME_BUFLEN];
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
 	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
 	/* Zero-pad the queue index so that all node names have equal width. */
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
 		qfmt = "txq%02d";
 	else
 		qfmt = "txq%d";
 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
 			       CTLFLAG_RD,
 			       &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 				CTLFLAG_RD,
 				&txq->ift_dequeued, "total mbufs freed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 				CTLFLAG_RD,
 				&txq->ift_enqueued, "total mbufs enqueued");
 #endif
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 				   CTLFLAG_RD,
 				   &txq->ift_pullups, "# of times m_pullup was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 				   CTLFLAG_RD,
 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_map_failed, "# of times DMA map failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 				   CTLFLAG_RD,
 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 				   CTLFLAG_RD,
 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 				   CTLFLAG_RD,
 				   &txq->ift_pidx, 1, "Producer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx, 1, "Consumer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 				   CTLFLAG_RD,
 				   &txq->ift_in_use, 1, "descriptors in use");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_processed, "descriptors procesed for clean");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 				   CTLFLAG_RD,
 				   &txq->ift_cleaned, "total cleaned");
 		/* The ring state word is decoded into text by mp_ring_state_handler. */
 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 		    __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
 		    mp_ring_state_handler, "A", "soft ring state");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
 				       CTLFLAG_RD, &txq->ift_br->enqueues,
 				       "# of enqueues to the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
 				       CTLFLAG_RD, &txq->ift_br->drops,
 				       "# of drops in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
 				       CTLFLAG_RD, &txq->ift_br->starts,
 				       "# of normal consumer starts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
 				       CTLFLAG_RD, &txq->ift_br->stalls,
 					       "# of consumer stalls in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
 			       CTLFLAG_RD, &txq->ift_br->restarts,
 				       "# of consumer restarts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
 				       CTLFLAG_RD, &txq->ift_br->abdications,
 				       "# of consumer abdications in the mp_ring for this queue");
 	}
 
 	/* Same name-width selection as for the TX queues above. */
 	if (scctx->isc_nrxqsets > 100)
 		qfmt = "rxq%03d";
 	else if (scctx->isc_nrxqsets > 10)
 		qfmt = "rxq%02d";
 	else
 		qfmt = "rxq%d";
 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
 			       CTLFLAG_RD,
 			       &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
 
 		/* One child node per free list of this RX queue set. */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 			    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist Name");
 			fl_list = SYSCTL_CHILDREN(fl_node);
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_cidx, 1, "Consumer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 				       CTLFLAG_RD,
 				       &fl->ifl_credits, 1, "credits available");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
 				       CTLFLAG_RD,
 				       &fl->ifl_buf_size, 1, "buffer size");
 #if MEMORY_LOGGING
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_m_enqueued, "mbufs allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_m_dequeued, "mbufs freed");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_enqueued, "clusters allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_dequeued, "clusters freed");
 #endif
 		}
 	}
 
 }
 
 /*
  * Set the IFC_DO_RESET flag on the context under the state lock.
  * Nothing else is done here; the flag is presumably acted on elsewhere.
  */
 void
 iflib_request_reset(if_ctx_t ctx)
 {
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_DO_RESET;
 	STATE_UNLOCK(ctx);
 }
 
 #ifndef __NO_STRICT_ALIGNMENT
 /*
  * Move the start of a received frame by ETHER_HDR_LEN bytes.
  *
  * When the data fits, it is copied ETHER_HDR_LEN bytes further into the
  * same mbuf and 'm' is returned.  Otherwise a new header mbuf is
  * allocated, the first ETHER_HDR_LEN bytes are moved into it, the packet
  * header is transferred and 'm' is chained behind it.  If that allocation
  * fails 'm' is freed and NULL is returned.
  * NOTE(review): presumably this realigns the payload for
  * strict-alignment platforms - confirm against the callers.
  */
 static struct mbuf *
 iflib_fixup_rx(struct mbuf *m)
 {
 	struct mbuf *hdr;
 
 	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
 		/* Enough room: shift the data within this mbuf. */
 		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
 		m->m_data += ETHER_HDR_LEN;
 		return (m);
 	}
 
 	/* No room: split the first ETHER_HDR_LEN bytes into a new mbuf. */
 	MGETHDR(hdr, M_NOWAIT, MT_DATA);
 	if (hdr == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	bcopy(m->m_data, hdr->m_data, ETHER_HDR_LEN);
 	m->m_data += ETHER_HDR_LEN;
 	m->m_len -= ETHER_HDR_LEN;
 	hdr->m_len = ETHER_HDR_LEN;
 	M_MOVE_PKTHDR(hdr, m);
 	hdr->m_next = m;
 	return (hdr);
 }
 #endif
 
 #ifdef DEBUGNET
 /*
  * debugnet init hook: report, under the context lock, the number of RX
  * queue sets and the size and buffer size of the first free list of RX
  * queue 0 through the three output pointers.
  */
 static void
 iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 	CTX_LOCK(ctx);
 	*nrxr = NRXQSETS(ctx);
 	/* Only free list 0 of queue 0 is consulted. */
 	*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
 	*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * debugnet event hook.
  *
  * On DEBUGNET_START the mbuf zone of every free list of every RX queue
  * set is re-derived from that free list's buffer size, and TX batching
  * is turned off via iflib_no_tx_batch.  All other events are ignored.
  */
 static void
 iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
 {
 	if_ctx_t ctx;
 	if_softc_ctx_t scctx;
 	iflib_fl_t fl;
 	iflib_rxq_t rxq;
 	int i, j;
 
 	ctx = if_getsoftc(ifp);
 	scctx = &ctx->ifc_softc_ctx;
 
 	switch (event) {
 	case DEBUGNET_START:
 		for (i = 0; i < scctx->isc_nrxqsets; i++) {
 			rxq = &ctx->ifc_rxqs[i];
 			for (j = 0; j < rxq->ifr_nfl; j++) {
 				/*
 				 * Index by j: each free list carries its own
 				 * buffer size and zone, so every one of them
 				 * must be updated, not only the first.
 				 */
 				fl = &rxq->ifr_fl[j];
 				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 			}
 		}
 		iflib_no_tx_batch = 1;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * debugnet transmit hook: encapsulate 'm' on TX queue 0 and, when that
  * succeeds, run the doorbell check for the queue.
  * Returns EBUSY unless the interface is running and not marked OACTIVE,
  * otherwise the result of iflib_encap().
  */
 static int
 iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int error;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	txq = &ctx->ifc_txqs[0];
 	error = iflib_encap(txq, &m);
 	if (error != 0)
 		return (error);
 
 	(void)iflib_txd_db_check(txq, true);
 	return (0);
 }
 
 /*
  * debugnet poll hook: reclaim completed TX descriptors on TX queue 0,
  * then process every RX queue set inside a network epoch section.
  * Returns EBUSY unless the interface is running and not marked OACTIVE,
  * otherwise 0.  'count' is not used.
  */
 static int
 iflib_debugnet_poll(if_t ifp, int count)
 {
 	struct epoch_tracker et;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int qidx;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	(void)iflib_completed_tx_reclaim(&ctx->ifc_txqs[0],
 	    RECLAIM_THRESH(ctx));
 
 	NET_EPOCH_ENTER(et);
 	for (qidx = 0; qidx < ctx->ifc_softc_ctx.isc_nrxqsets; qidx++)
 		(void)iflib_rxeof(&ctx->ifc_rxqs[qidx], 16 /* XXX */);
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 #endif /* DEBUGNET */
diff --git a/sys/net/iflib_clone.c b/sys/net/iflib_clone.c
index 89d37a586f8d..32ec0119d33b 100644
--- a/sys/net/iflib_clone.c
+++ b/sys/net/iflib_clone.c
@@ -1,302 +1,303 @@
 /*-
  * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
  * Copyright (C) 2017-2018 Joyent Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/event.h>
 #include <sys/sockio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/jail.h>
 #include <sys/md5.h>
 #include <sys/proc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/if_clone.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/vnet.h>
 
 #include <net/iflib.h>
 #include <net/iflib_private.h>
 #include "ifdi_if.h"
 
 /*
  * Attach routine that does no work and always reports success.
  */
 int
 noop_attach(device_t dev)
 {
 
 	return (0);
 }
 
 /*
  * Detach routine for pseudo devices.  Returns EBUSY while the context has
  * IFC_INIT_DONE set without IFC_IN_DETACH, and 0 in every other case.
  */
 int
 iflib_pseudo_detach(device_t dev)
 {
 	if_ctx_t ctx;
 	int busy;
 
 	ctx = device_get_softc(dev);
 	busy = (iflib_get_flags(ctx) & (IFC_INIT_DONE | IFC_IN_DETACH)) ==
 	    IFC_INIT_DONE;
 	return (busy ? EBUSY : 0);
 }
 
 /*
  * Device created on first use by iflib_clone_create() as an "ifpseudo" child
  * of root_bus; each cloned pseudo interface is added as a child of it.
  */
 static device_t iflib_pseudodev;
 
 /* Mutex taken and released by PSEUDO_LOCK() / PSEUDO_UNLOCK(). */
 static struct mtx pseudoif_mtx;
 MTX_SYSINIT(pseudoif_mtx, &pseudoif_mtx, "pseudoif_mtx", MTX_DEF);
 
 /*
  * Lock / unlock pseudoif_mtx.  The expansions carry no trailing semicolon,
  * so "PSEUDO_LOCK();" is exactly one statement and the macros stay usable
  * in unbraced if/else bodies.
  */
 #define PSEUDO_LOCK() mtx_lock(&pseudoif_mtx)
 #define PSEUDO_UNLOCK() mtx_unlock(&pseudoif_mtx)
 
 /*
  * Registration record for one pseudo-interface driver, created by
  * iflib_clone_register() and freed by iflib_clone_deregister().
  */
 struct if_pseudo {
 	eventhandler_tag ip_detach_tag;	/* ifnet_departure_event handler tag */
 	eventhandler_tag ip_lladdr_tag;	/* iflladdr_event handler tag */
 	struct if_clone *ip_ifc;	/* cloner returned by if_clone_simple() */
 	if_shared_ctx_t ip_sctx;	/* shared ctx; isc_name is the lookup key */
 	devclass_t ip_dc;		/* devclass created from isc_name */
 	LIST_ENTRY(if_pseudo) ip_list;	/* linkage on iflib_pseudos */
 	int ip_on_list;			/* nonzero while linked on iflib_pseudos */
 };
 
 /* All registered pseudo drivers; modified and walked under PSEUDO_LOCK(). */
 static LIST_HEAD(, if_pseudo) iflib_pseudos = LIST_HEAD_INITIALIZER(iflib_pseudos);
 
 /*
  * Find the registration whose shared-context name equals "name".  Returns
  * the matching entry, or NULL when there is none.
  *
  * XXX the list lock is dropped before returning, so this assumes that the
  * rest of the code won't hang on to the entry after it's removed / unloaded.
  */
 static if_pseudo_t
 iflib_ip_lookup(const char *name)
 {
 	if_pseudo_t cur, match;
 
 	match = NULL;
 	PSEUDO_LOCK();
 	LIST_FOREACH(cur, &iflib_pseudos, ip_list) {
 		if (strcmp(cur->ip_sctx->isc_name, name) == 0) {
 			match = cur;
 			break;
 		}
 	}
 	PSEUDO_UNLOCK();
 	return (match);
 }
 
 /*
  * Unlink "ip" from iflib_pseudos if it is currently linked; a second call
  * is a no-op because ip_on_list is cleared under the list lock.
  */
 static void
 iflib_ip_delete(if_pseudo_t ip)
 {
 
 	PSEUDO_LOCK();
 	if (ip->ip_on_list != 0) {
 		ip->ip_on_list = 0;
 		LIST_REMOVE(ip, ip_list);
 	}
 	PSEUDO_UNLOCK();
 }
 
 /*
  * Link "ip" at the head of iflib_pseudos unless it is already linked;
  * ip_on_list records the state and is updated under the list lock.
  */
 static void
 iflib_ip_insert(if_pseudo_t ip)
 {
 
 	PSEUDO_LOCK();
 	if (ip->ip_on_list == 0) {
 		ip->ip_on_list = 1;
 		LIST_INSERT_HEAD(&iflib_pseudos, ip, ip_list);
 	}
 	PSEUDO_UNLOCK();
 }
 
 /*
  * ifnet_departure_event handler.  It currently performs no work: it returns
  * early for an interface that is only being renamed and does nothing else
  * for any other interface.
  */
 static void
 iflib_ifdetach(void *arg __unused, if_t ifp)
 {
 
 	/* A rename is not a real departure; ignore it. */
 	if ((ifp->if_flags & IFF_RENAMING) != 0)
 		return;
 }
 
 /*
  * iflladdr_event handler.  Intentionally empty: no action is taken when an
  * interface's link-layer address changes.
  */
 static void
 iflib_iflladdr(void *arg __unused, if_t ifp __unused)
 {
 
 }
 
 /*
  * if_clone create callback for iflib pseudo interfaces.
  *
  * Looks up the pseudo-driver registration by cloner name, adds a child
  * device with the requested unit under iflib_pseudodev, attaches it and
  * registers it with iflib.  On success the resulting context becomes the
  * device softc.
  *
  * Returns 0 on success; ENOENT when no registration matches the cloner
  * name; EBUSY when the unit already has a device; ENXIO when the child
  * device cannot be created; otherwise the error from device_attach() or
  * iflib_pseudo_register(), in which case the child device is deleted again.
  */
 static int
 iflib_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	const char *name = ifc_name(ifc);
 	struct iflib_cloneattach_ctx clctx;
 	if_ctx_t ctx;
 	if_pseudo_t ip;
 	device_t dev;
 	int rc;
 
 	clctx.cc_ifc = ifc;
 	clctx.cc_len = 0;
 	clctx.cc_params = params;
 	clctx.cc_name = name;
 
 	if (__predict_false(iflib_pseudodev == NULL)) {
 		/* SYSINIT initialization would panic !?! */
 		bus_topo_lock();
 		iflib_pseudodev = device_add_child(root_bus, "ifpseudo", 0);
 		bus_topo_unlock();
 		MPASS(iflib_pseudodev != NULL);
 	}
 	ip = iflib_ip_lookup(name);
 	if (ip == NULL) {
 		printf("no ip found for %s\n", name);
 		return (ENOENT);
 	}
 	if ((dev = devclass_get_device(ip->ip_dc, unit)) != NULL) {
 		printf("unit %d allocated\n", unit);
 		bus_generic_print_child(iflib_pseudodev, dev);
 		return (EBUSY);
 	}
 	PSEUDO_LOCK();
 	dev = device_add_child(iflib_pseudodev, name, unit);
 	if (dev == NULL) {
 		/* Check before first use; dev is dereferenced right below. */
 		PSEUDO_UNLOCK();
 		return (ENXIO);
 	}
 	device_set_driver(dev, &iflib_pseudodriver);
 	PSEUDO_UNLOCK();
 	device_quiet(dev);
 	rc = device_attach(dev);
 	if (rc != 0) {
 		/* Undo the child creation, as for a registration failure. */
 		bus_topo_lock();
 		device_delete_child(iflib_pseudodev, dev);
 		bus_topo_unlock();
 		return (rc);
 	}
 	MPASS(devclass_get_device(ip->ip_dc, unit) == dev);
 	rc = iflib_pseudo_register(dev, ip->ip_sctx, &ctx, &clctx);
 	if (rc) {
 		bus_topo_lock();
 		device_delete_child(iflib_pseudodev, dev);
 		bus_topo_unlock();
 	} else
 		device_set_softc(dev, ctx);
 
 	return (rc);
 }
 
 /*
  * if_clone destroy callback: marks the context as detaching and stops it
  * under the context lock, deletes the child device, and deregisters the
  * pseudo interface only if the device deletion succeeded.
  */
 static void
 iflib_clone_destroy(if_t ifp)
 {
 	if_ctx_t ctx;
 	device_t dev;
 	struct sx *lock;
 	int error;
 
 	ctx = if_getsoftc(ifp);
 	dev = iflib_get_dev(ctx);
 
 	/* Flag the detach and stop the interface under the context lock. */
 	lock = iflib_ctx_lock_get(ctx);
 	sx_xlock(lock);
 	iflib_set_detach(ctx);
 	iflib_stop(ctx);
 	sx_xunlock(lock);
 
 	/* Detach and delete the device, which also releases its unit. */
 	bus_topo_lock();
 	error = device_delete_child(iflib_pseudodev, dev);
 	bus_topo_unlock();
 	if (error != 0)
 		return;
 	iflib_pseudo_deregister(ctx);
 }
 
 /*
  * Register a pseudo-interface driver described by "sctx".
  *
  * Creates a devclass and a simple cloner named after sctx->isc_name, hooks
  * the iflladdr and ifnet departure events, and links the new record onto
  * iflib_pseudos.  Returns the record, or NULL when the name is missing or
  * already registered, or when any setup step fails; steps completed before
  * a failure are undone, except that the devclass is not released.
  */
 if_pseudo_t
 iflib_clone_register(if_shared_ctx_t sctx)
 {
 	if_pseudo_t pseudo;
 
 	if (sctx->isc_name == NULL) {
 		printf("iflib_clone_register failed - shared_ctx needs to have a device name\n");
 		return (NULL);
 	}
 	if (iflib_ip_lookup(sctx->isc_name) != NULL) {
 		printf("iflib_clone_register failed - shared_ctx %s alread registered\n",
 		    sctx->isc_name);
 		return (NULL);
 	}
 
 	pseudo = malloc(sizeof(*pseudo), M_IFLIB, M_WAITOK | M_ZERO);
 	pseudo->ip_sctx = sctx;
 
 	pseudo->ip_dc = devclass_create(sctx->isc_name);
 	if (pseudo->ip_dc == NULL)
 		goto err_free;
 
 	/* XXX --- we can handle clone_advanced later */
 	pseudo->ip_ifc = if_clone_simple(sctx->isc_name, iflib_clone_create,
 	    iflib_clone_destroy, 0);
 	if (pseudo->ip_ifc == NULL) {
 		printf("clone_simple failed -- cloned %s  devices will not be available\n", sctx->isc_name);
 		goto err_free;
 	}
 
 	pseudo->ip_lladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 	    iflib_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 	if (pseudo->ip_lladdr_tag == NULL)
 		goto err_cloner;
 
 	pseudo->ip_detach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    iflib_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 	if (pseudo->ip_detach_tag == NULL)
 		goto err_lladdr;
 
 	iflib_ip_insert(pseudo);
 	return (pseudo);
 
  err_lladdr:
 	EVENTHANDLER_DEREGISTER(iflladdr_event, pseudo->ip_lladdr_tag);
  err_cloner:
 	if_clone_detach(pseudo->ip_ifc);
  err_free:
 	free(pseudo, M_IFLIB);
 	return (NULL);
 }
 
 /*
  * Undo iflib_clone_register(): unlink the record from iflib_pseudos, remove
  * both event handlers, detach the cloner and free the record.  The devclass
  * is not released, and no check is made that the driver is unused.
  */
 void
 iflib_clone_deregister(if_pseudo_t ip)
 {
 
 	/* XXX check that is not still in use */
 	iflib_ip_delete(ip);
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, ip->ip_detach_tag);
 	EVENTHANDLER_DEREGISTER(iflladdr_event, ip->ip_lladdr_tag);
 
 	if_clone_detach(ip->ip_ifc);
 
 	/* XXX free devclass */
 	free(ip, M_IFLIB);
 }
diff --git a/sys/net/netisr.c b/sys/net/netisr.c
index 9898e0b18caf..e5ec57a7263d 100644
--- a/sys/net/netisr.c
+++ b/sys/net/netisr.c
@@ -1,1564 +1,1565 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * netisr is a packet dispatch service, allowing synchronous (directly
  * dispatched) and asynchronous (deferred dispatch) processing of packets by
  * registered protocol handlers.  Callers pass a protocol identifier and
  * packet to netisr, along with a direct dispatch hint, and work will either
  * be immediately processed by the registered handler, or passed to a
  * software interrupt (SWI) thread for deferred dispatch.  Callers will
  * generally select one or the other based on:
  *
  * - Whether directly dispatching a netisr handler leads to code reentrance or
  *   lock recursion, such as entering the socket code from the socket code.
  * - Whether directly dispatching a netisr handler leads to recursive
  *   processing, such as when decapsulating several wrapped layers of tunnel
  *   information (IPSEC within IPSEC within ...).
  *
  * Maintaining ordering for protocol streams is a critical design concern.
  * Enforcing ordering limits the opportunity for concurrency, but maintains
  * the strong ordering requirements found in some protocols, such as TCP.  Of
  * related concern is CPU affinity--it is desirable to process all data
  * associated with a particular stream on the same CPU over time in order to
  * avoid acquiring locks associated with the connection on different CPUs,
  * keep connection data in one cache, and to generally encourage associated
  * user threads to live on the same CPU as the stream.  It's also desirable
  * to avoid lock migration and contention where locks are associated with
  * more than one flow.
  *
  * netisr supports several policy variations, represented by the
  * NETISR_POLICY_* constants, allowing protocols to play various roles in
  * identifying flows, assigning work to CPUs, etc.  These are described in
  * netisr.h.
  */
 
 #include "opt_ddb.h"
 #include "opt_device_polling.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/interrupt.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/netisr_internal.h>
 #include <net/vnet.h>
 
 /*-
  * Synchronize use and modification of the registered netisr data structures;
  * acquire the write lock while modifying the set of registered protocols to
  * prevent partially registered or unregistered protocols from being run.
  *
  * The following data structures and fields are protected by this lock:
  *
  * - The netisr_proto array, including all fields of struct netisr_proto.
  * - The nws array, including all fields of struct netisr_worker.
  * - The nws_array array.
  *
  * Note: the NETISR_LOCKING define controls whether read locks are acquired
  * in packet processing paths requiring netisr registration stability.  This
  * is disabled by default as it can lead to measurable performance
  * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
  * because netisr registration and unregistration is extremely rare at
  * runtime.  If it becomes more common, this decision should be revisited.
  *
  * XXXRW: rmlocks don't support assertions.
  */
 static struct rmlock	netisr_rmlock;
 #define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
 				    RM_NOWITNESS)
 #define	NETISR_LOCK_ASSERT()
 #define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
 #define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
 #define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
 #define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
 /* #define	NETISR_LOCKING */
 
 static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "netisr");
 
 /*-
  * Three global direct dispatch policies are supported:
  *
  * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of
  * context (may be overridden by protocols).
  *
  * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
  * and we're running on the CPU the work would be performed on, then direct
  * dispatch it if it wouldn't violate ordering constraints on the workstream.
  *
  * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
  * always direct dispatch.  (The default.)
  *
  * Notice that changing the global policy could lead to short periods of
  * misordered processing, but this is considered acceptable as compared to
  * the complexity of enforcing ordering during policy changes.  Protocols can
  * override the global policy (when they're not doing that, they select
  * NETISR_DISPATCH_DEFAULT).
  */
 #define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
 #define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
 static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
 static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_net_isr, OID_AUTO, dispatch,
     CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_netisr_dispatch_policy, "A",
     "netisr dispatch policy");
 
 /*
  * Allow the administrator to limit the number of threads (CPUs) to use for
  * netisr.  We don't check netisr_maxthreads before creating the thread for
  * CPU 0. This must be set at boot. We will create at most one thread per CPU.
  * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and
  * therefore only 1 workstream. If set to -1, netisr would use all cpus
  * (mp_ncpus) and therefore would have those many workstreams. One workstream
  * per thread (CPU).
  */
 static int	netisr_maxthreads = 1;		/* Max number of threads. */
 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
     &netisr_maxthreads, 0,
     "Use at most this many CPUs for netisr processing");
 
 static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
     &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");
 
 /*
  * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit,
  * both for initial configuration and later modification using
  * netisr_setqlimit().
  */
 #define	NETISR_DEFAULT_MAXQLIMIT	10240
 static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
     &netisr_maxqlimit, 0,
     "Maximum netisr per-protocol, per-CPU queue depth.");
 
 /*
  * The default per-workstream mbuf queue limit for protocols that don't
  * initialize the nh_qlimit field of their struct netisr_handler.  If this is
  * set above netisr_maxqlimit, we truncate it to the maximum during boot.
  */
 #define	NETISR_DEFAULT_DEFAULTQLIMIT	256
 static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
     &netisr_defaultqlimit, 0,
     "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
 
 /*
  * Store and export the compile-time constant NETISR_MAXPROT limit on the
  * number of protocols that can register with netisr at a time.  This is
  * required for crashdump analysis, as it sizes netisr_proto[].
  */
 static u_int	netisr_maxprot = NETISR_MAXPROT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
     &netisr_maxprot, 0,
     "Compile-time limit on the number of protocols supported by netisr.");
 
 /*
  * The netisr_proto array describes all registered protocols, indexed by
  * protocol number.  See netisr_internal.h for more details.
  */
 static struct netisr_proto	netisr_proto[NETISR_MAXPROT];
 
 #ifdef VIMAGE
 /*
  * The netisr_enable array describes a per-VNET flag for registered
  * protocols on whether this netisr is active in this VNET or not.
  * netisr_register() will automatically enable the netisr for the
  * default VNET and all currently active instances.
  * netisr_unregister() will disable all active VNETs, including vnet0.
  * Individual network stack instances can be enabled/disabled by the
  * netisr_(un)register _vnet() functions.
  * With this we keep the one netisr_proto per protocol but add a
  * mechanism to stop netisr processing for vnet teardown.
  * Apart from that we expect a VNET to always be enabled.
  */
 VNET_DEFINE_STATIC(u_int,	netisr_enable[NETISR_MAXPROT]);
 #define	V_netisr_enable		VNET(netisr_enable)
 #endif
 
 /*
  * Per-CPU workstream data.  See netisr_internal.h for more details.
  */
 DPCPU_DEFINE(struct netisr_workstream, nws);
 
 /*
  * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
  * accessing workstreams.  This allows constructions of the form
  * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
  */
 static u_int				 nws_array[MAXCPU];
 
 /*
  * Number of registered workstreams.  Will be at most the number of running
  * CPUs once fully started.
  */
 static u_int				 nws_count;
 SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
     &nws_count, 0, "Number of extant netisr threads.");
 
 /*
  * Synchronization for each workstream: a mutex protects all mutable fields
  * in each stream, including per-protocol state (mbuf queues).  The SWI is
  * woken up if asynchronous dispatch is required.
  */
 #define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
 #define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
 #define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
 #define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
 
 /*
  * Utility routines for protocols that implement their own mapping of flows
  * to CPUs.
  */
 /*
  * Return the current number of netisr workstreams (nws_count).
  */
 u_int
 netisr_get_cpucount(void)
 {
 	u_int count;
 
 	count = nws_count;
 	return (count);
 }
 
 /*
  * Map an arbitrary number onto a workstream CPU ID by reducing it modulo
  * nws_count and indexing nws_array.
  */
 u_int
 netisr_get_cpuid(u_int cpunumber)
 {
 	u_int slot;
 
 	slot = cpunumber % nws_count;
 	return (nws_array[slot]);
 }
 
 /*
  * Default flow -> CPU ID mapping: the flow ID, reduced modulo nws_count,
  * selects an entry of nws_array.
  *
  * Exported (non-static) so that protocols can map their own work to CPUs in
  * a way that is consistent with netisr, for affinity purposes.
  */
 u_int
 netisr_default_flow2cpu(u_int flowid)
 {
 	u_int slot;
 
 	slot = flowid % nws_count;
 	return (nws_array[slot]);
 }
 
 /*
  * Dispatch tunable and sysctl configuration.
  *
  * Each table entry pairs a NETISR_DISPATCH_* policy value with the string
  * that represents it.  The table is searched linearly, by value in
  * netisr_dispatch_policy_to_str() and by string in
  * netisr_dispatch_policy_from_str().
  */
 struct netisr_dispatch_table_entry {
 	u_int		 ndte_policy;		/* NETISR_DISPATCH_* value */
 	const char	*ndte_policy_str;	/* its textual name */
 };
 static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
 	{ NETISR_DISPATCH_DEFAULT, "default" },
 	{ NETISR_DISPATCH_DEFERRED, "deferred" },
 	{ NETISR_DISPATCH_HYBRID, "hybrid" },
 	{ NETISR_DISPATCH_DIRECT, "direct" },
 };
 
 /*
  * Write the textual name of "dispatch_policy" into "buffer" (at most
  * "buflen" bytes, via snprintf).  A value that is not present in
  * netisr_dispatch_table is rendered as "unknown".
  */
 static void
 netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
     u_int buflen)
 {
 	const char *label;
 	u_int idx;
 
 	label = "unknown";
 	for (idx = 0; idx < nitems(netisr_dispatch_table); idx++) {
 		if (netisr_dispatch_table[idx].ndte_policy != dispatch_policy)
 			continue;
 		label = netisr_dispatch_table[idx].ndte_policy_str;
 		break;
 	}
 	snprintf(buffer, buflen, "%s", label);
 }
 
 /*
  * Translate a policy name into its NETISR_DISPATCH_* value.  On an exact
  * (strcmp) match the value is stored in *dispatch_policyp and 0 is
  * returned; otherwise *dispatch_policyp is left untouched and EINVAL is
  * returned.
  */
 static int
 netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
 {
 	u_int idx;
 
 	for (idx = 0; idx < nitems(netisr_dispatch_table); idx++) {
 		if (strcmp(netisr_dispatch_table[idx].ndte_policy_str,
 		    str) != 0)
 			continue;
 		*dispatch_policyp = netisr_dispatch_table[idx].ndte_policy;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 static int
 sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
 {
 	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
 	size_t len;
 	u_int dispatch_policy;
 	int error;
 
 	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
 	    sizeof(tmp));
 	/*
 	 * netisr is initialised very early during the boot when malloc isn't
 	 * available yet so we can't use sysctl_handle_string() to process
 	 * any non-default value that was potentially set via loader.
 	 */
 	if (req->newptr != NULL) {
 		len = req->newlen - req->newidx;
 		if (len >= NETISR_DISPATCH_POLICY_MAXSTR)
 			return (EINVAL);
 		error = SYSCTL_IN(req, tmp, len);
 		if (error == 0) {
 			tmp[len] = '\0';
 			error = netisr_dispatch_policy_from_str(tmp,
 			    &dispatch_policy);
 			if (error == 0 &&
 			    dispatch_policy == NETISR_DISPATCH_DEFAULT)
 				error = EINVAL;
 			if (error == 0)
 				netisr_dispatch_policy = dispatch_policy;
 		}
 	} else {
 		error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
 	}
 	return (error);
 }
 
 /*
  * Register a new netisr handler, which requires initializing per-protocol
  * fields for each workstream.  All netisr work is briefly suspended while
  * the protocol is installed.
  */
 void
 netisr_register(const struct netisr_handler *nhp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 	const char *name;
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 	name = nhp->nh_name;
 
 	/*
 	 * Test that the requested registration is valid.
 	 */
 	KASSERT(nhp->nh_name != NULL,
 	    ("%s: nh_name NULL for %u", __func__, proto));
 	KASSERT(nhp->nh_handler != NULL,
 	    ("%s: nh_handler NULL for %s", __func__, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
 	    nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_policy == NETISR_POLICY_CPU,
 	    ("%s: unsupported nh_policy %u for %s", __func__,
 	    nhp->nh_policy, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_m2flow == NULL,
 	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
 	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
 	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
 
 	/*
 	 * Test that no existing registration exists for this protocol.
 	 */
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_name == NULL,
 	    ("%s(%u, %s): name present", __func__, proto, name));
 	KASSERT(netisr_proto[proto].np_handler == NULL,
 	    ("%s(%u, %s): handler present", __func__, proto, name));
 
 	netisr_proto[proto].np_name = name;
 	netisr_proto[proto].np_handler = nhp->nh_handler;
 	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
 	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
 	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
 	if (nhp->nh_qlimit == 0)
 		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
 	else if (nhp->nh_qlimit > netisr_maxqlimit) {
 		printf("%s: %s requested queue limit %u capped to "
 		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
 		    netisr_maxqlimit);
 		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
 	} else
 		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
 	netisr_proto[proto].np_policy = nhp->nh_policy;
 	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		bzero(npwp, sizeof(*npwp));
 		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
 	}
 
 #ifdef VIMAGE
 	/*
 	 * Test that we are in vnet0 and have a curvnet set.
 	 */
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p",
 	    __func__, curvnet, vnet0));
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_netisr_enable[proto] = 1;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 #endif
 	NETISR_WUNLOCK();
 }
 
 /*
  * Clear drop counters across all workstreams for a protocol.
  */
 void
 netisr_clearqdrops(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qdrops = 0;
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Sum the drop counters of a registered protocol over all workstreams and
  * store the total in *qdropp.  Runs under the netisr read lock.
  */
 void
 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
 {
 	struct rm_priotracker tracker;
 	u_int64_t total;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int cpu, proto;
 
 	total = 0;
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(cpu)
 		total += (DPCPU_ID_PTR(cpu, nws))->nws_work[proto].nw_qdrops;
 	*qdropp = total;
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Store the current per-workstream queue limit of a registered protocol in
  * *qlimitp.  The value is read under the netisr read lock.
  */
 void
 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
 {
 	struct rm_priotracker tracker;
 	const struct netisr_proto *npp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 	npp = &netisr_proto[proto];
 
 	NETISR_RLOCK(&tracker);
 	KASSERT(npp->np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 	*qlimitp = npp->np_qlimit;
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Change the queue limit of a registered protocol, both in its
  * netisr_proto entry and in every workstream's work structure.  Only the
  * limits are changed; packets already queued beyond the new limit are not
  * drained here.  Returns EINVAL when qlimit exceeds netisr_maxqlimit, and 0
  * otherwise.
  */
 int
 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
 {
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int cpu, proto;
 
 	if (qlimit > netisr_maxqlimit)
 		return (EINVAL);
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	netisr_proto[proto].np_qlimit = qlimit;
 	CPU_FOREACH(cpu)
 		(DPCPU_ID_PTR(cpu, nws))->nws_work[proto].nw_qlimit = qlimit;
 	NETISR_WUNLOCK();
 	return (0);
 }
 
 /*
  * Free every packet queued on one protocol work queue, leaving the queue
  * empty (head and tail NULL, length 0).
  */
 static void
 netisr_drain_proto(struct netisr_work *npwp)
 {
 	struct mbuf *pkt, *next;
 
 	/*
 	 * We would assert the lock on the workstream but it's not passed in.
 	 */
 	for (pkt = npwp->nw_head; pkt != NULL; pkt = next) {
 		next = pkt->m_nextpkt;
 		pkt->m_nextpkt = NULL;
 		npwp->nw_head = next;
 		if (next == NULL)
 			npwp->nw_tail = NULL;
 		npwp->nw_len--;
 		m_freem(pkt);
 	}
 	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
 	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
 }
 
 /*
  * Remove the registration of a network protocol, which requires clearing
  * per-protocol fields across all workstreams, including freeing all mbufs in
  * the queues at time of unregister.  All work in netisr is briefly suspended
  * while this takes place.
  */
 void
 netisr_unregister(const struct netisr_handler *nhp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 #ifdef VIMAGE
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_netisr_enable[proto] = 0;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 #endif
 
 	netisr_proto[proto].np_name = NULL;
 	netisr_proto[proto].np_handler = NULL;
 	netisr_proto[proto].np_m2flow = NULL;
 	netisr_proto[proto].np_m2cpuid = NULL;
 	netisr_proto[proto].np_qlimit = 0;
 	netisr_proto[proto].np_policy = 0;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		netisr_drain_proto(npwp);
 		bzero(npwp, sizeof(*npwp));
 	}
 	NETISR_WUNLOCK();
 }
 
 #ifdef VIMAGE
 void
 netisr_register_vnet(const struct netisr_handler *nhp)
 {
 	u_int proto;
 
 	proto = nhp->nh_proto;
 
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    nhp->nh_name));
 
 	V_netisr_enable[proto] = 1;
 	NETISR_WUNLOCK();
 }
 
 /*
  * Free every packet queued for "proto" whose receive interface belongs to
  * "vnet", or whose receive interface can no longer be found, on every
  * workstream that has an interrupt event.  Packets received on interfaces
  * of other vnets stay queued in their original order.  Each workstream is
  * processed under its own lock.
  */
 static void
 netisr_drain_proto_vnet(struct vnet *vnet, u_int proto)
 {
 	struct epoch_tracker et;
 	struct netisr_workstream *nwsp;
 	struct netisr_work *npwp;
 	struct mbuf *m, *mp, *n, *ne;
 	struct ifnet *ifp;
 	u_int i;
 
 	KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__));
 	NETISR_LOCK_ASSERT();
 
 	CPU_FOREACH(i) {
 		nwsp = DPCPU_ID_PTR(i, nws);
 		/* Skip workstreams that have no interrupt event. */
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		npwp = &nwsp->nws_work[proto];
 		NWS_LOCK(nwsp);
 
 		/*
 		 * Rather than dissecting and removing mbufs from the middle
 		 * of the chain, we build a new chain if the packet stays and
 		 * update the head and tail pointers at the end.  All packets
 		 * matching the given vnet are freed.
 		 */
 		m = npwp->nw_head;
 		n = ne = NULL;	/* head and tail of the chain of kept packets */
 		NET_EPOCH_ENTER(et);
 		while (m != NULL) {
 			/* Detach the current packet before deciding its fate. */
 			mp = m;
 			m = m->m_nextpkt;
 			mp->m_nextpkt = NULL;
 			if ((ifp = ifnet_byindexgen(mp->m_pkthdr.rcvidx,
 			    mp->m_pkthdr.rcvgen)) != NULL &&
 			    ifp->if_vnet != vnet) {
 				/* Different vnet: append to the kept chain. */
 				if (n == NULL) {
 					n = ne = mp;
 				} else {
 					ne->m_nextpkt = mp;
 					ne = mp;
 				}
 				continue;
 			}
 			/*
 			 * This is a packet in the selected vnet, or it
 			 * belongs to a destroyed interface.  Free it.
 			 */
 			npwp->nw_len--;
 			m_freem(mp);
 		}
 		NET_EPOCH_EXIT(et);
 		npwp->nw_head = n;
 		npwp->nw_tail = ne;
 		NWS_UNLOCK(nwsp);
 	}
 }
 
 void
 netisr_unregister_vnet(const struct netisr_handler *nhp)
 {
 	u_int proto;
 
 	proto = nhp->nh_proto;
 
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    nhp->nh_name));
 
 	V_netisr_enable[proto] = 0;
 
 	netisr_drain_proto_vnet(curvnet, proto);
 	NETISR_WUNLOCK();
 }
 #endif
 
 /*
  * Return the dispatch policy that applies to a protocol: its own setting
  * when one is configured, otherwise the global netisr_dispatch_policy.
  */
 static u_int
 netisr_get_dispatch(struct netisr_proto *npp)
 {
 	const u_int proto_policy = npp->np_dispatch;
 
 	/* NETISR_DISPATCH_DEFAULT means "no per-protocol override". */
 	return (proto_policy == NETISR_DISPATCH_DEFAULT ?
 	    netisr_dispatch_policy : proto_policy);
 }
 
 /*
  * Look up the workstream given a packet and source identifier.  Do this by
  * checking the protocol's policy, and optionally call out to the protocol
  * for assistance if required.
  *
  * On success the (possibly replaced) mbuf is returned and *cpuidp holds the
  * selected CPU.  NULL is returned when one of the protocol callbacks
  * (np_m2cpuid or np_m2flow) returns NULL; in that case *cpuidp must not be
  * relied upon.  The CPU and FLOW policies both fall back to the SOURCE
  * policy when they cannot produce an answer.
  */
 static struct mbuf *
 netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
     uintptr_t source, struct mbuf *m, u_int *cpuidp)
 {
 	struct ifnet *ifp;
 	u_int policy;
 
 	NETISR_LOCK_ASSERT();
 
 	/*
 	 * In the event we have only one worker, shortcut and deliver to it
 	 * without further ado.
 	 */
 	if (nws_count == 1) {
 		*cpuidp = nws_array[0];
 		return (m);
 	}
 
 	/*
 	 * What happens next depends on the policy selected by the protocol.
 	 * If we want to support per-interface policies, we should do that
 	 * here first.
 	 */
 	policy = npp->np_policy;
 	if (policy == NETISR_POLICY_CPU) {
 		m = npp->np_m2cpuid(m, source, cpuidp);
 		if (m == NULL)
 			return (NULL);
 
 		/*
 		 * It's possible for a protocol not to have a good idea about
 		 * where to process a packet, in which case we fall back on
 		 * the netisr code to decide.  In the hybrid case, return the
 		 * current CPU ID, which will force an immediate direct
 		 * dispatch.  In the queued case, fall back on the SOURCE
 		 * policy.
 		 */
 		if (*cpuidp != NETISR_CPUID_NONE) {
 			*cpuidp = netisr_get_cpuid(*cpuidp);
 			return (m);
 		}
 		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
 			*cpuidp = netisr_get_cpuid(curcpu);
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	if (policy == NETISR_POLICY_FLOW) {
 		/* Ask the protocol for a flow hash only if none is set yet. */
 		if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE &&
 		    npp->np_m2flow != NULL) {
 			m = npp->np_m2flow(m, source);
 			if (m == NULL)
 				return (NULL);
 		}
 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 			*cpuidp =
 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
 			return (m);
 		}
 		/* Still no flow hash: fall back to the SOURCE policy. */
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	KASSERT(policy == NETISR_POLICY_SOURCE,
 	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
 	    npp->np_name));
 
 	/*
 	 * SOURCE policy: pick a worker from the receive interface index (if
 	 * any) plus the caller-supplied source identifier.
 	 */
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL)
 		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
 	else
 		*cpuidp = nws_array[source % nws_count];
 	return (m);
 }
 
 /*
  * Process packets associated with a workstream and protocol.  For reasons of
  * fairness, we process up to one complete netisr queue at a time, moving the
  * queue to a stack-local queue for processing, but do not loop refreshing
  * from the global queue.  The caller is responsible for deciding whether to
  * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
  * locked on entry and relocked before return, but will be released while
  * processing.  The number of packets processed is returned.
  */
 static u_int
 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
 {
 	struct netisr_work local_npw, *npwp;
 	u_int handled;
 	struct mbuf *m;
 
 	NETISR_LOCK_ASSERT();
 	NWS_LOCK_ASSERT(nwsp);
 
 	KASSERT(nwsp->nws_flags & NWS_RUNNING,
 	    ("%s(%u): not running", __func__, proto));
 	/* proto is unsigned, so only the upper bound needs checking. */
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): invalid proto", __func__, proto));
 
 	npwp = &nwsp->nws_work[proto];
 	if (npwp->nw_len == 0)
 		return (0);
 
 	/*
 	 * Move the global work queue to a thread-local work queue.
 	 *
 	 * Notice that this means the effective maximum length of the queue
 	 * is actually twice that of the maximum queue length specified in
 	 * the protocol registration call.
 	 */
 	handled = npwp->nw_len;
 	local_npw = *npwp;
 	npwp->nw_head = NULL;
 	npwp->nw_tail = NULL;
 	npwp->nw_len = 0;
 	nwsp->nws_pendingbits &= ~(1 << proto);
 	NWS_UNLOCK(nwsp);
 	while ((m = local_npw.nw_head) != NULL) {
 		local_npw.nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (local_npw.nw_head == NULL)
 			local_npw.nw_tail = NULL;
 		local_npw.nw_len--;
 		/* Drop packets whose receive interface cannot be restored. */
 		if (__predict_false(m_rcvif_restore(m) == NULL)) {
 			m_freem(m);
 			continue;
 		}
 		/* Run the handler in the vnet of the receive interface. */
 		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 		netisr_proto[proto].np_handler(m);
 		CURVNET_RESTORE();
 	}
 	KASSERT(local_npw.nw_len == 0,
 	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
 	if (netisr_proto[proto].np_drainedcpu)
 		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
 	NWS_LOCK(nwsp);
 	npwp->nw_handled += handled;
 	return (handled);
 }
 
 /*
  * Software interrupt handler for a netisr workstream, run in response to
  * NWS_SIGNAL().  It drains every protocol queue flagged in the workstream's
  * pending bits.  If another thread is currently direct dispatching on this
  * workstream, nothing is processed here; that thread signals the worker
  * again once it is done.
  */
 static void
 swi_net(void *arg)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp = arg;
 	u_int pending, idx;
 
 #ifdef DEVICE_POLLING
 	KASSERT(nws_count == 1,
 	    ("%s: device_polling but nws_count != 1", __func__));
 	netisr_poll();
 #endif
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	NWS_LOCK(nwsp);
 	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
 	if ((nwsp->nws_flags & NWS_DISPATCHING) == 0) {
 		nwsp->nws_flags |= NWS_RUNNING;
 		nwsp->nws_flags &= ~NWS_SCHEDULED;
 		/*
 		 * Take a snapshot of the pending bits, service each flagged
 		 * protocol lowest bit first, then re-read the bits in case
 		 * more work arrived while the lock was dropped.
 		 */
 		for (pending = nwsp->nws_pendingbits; pending != 0;
 		    pending = nwsp->nws_pendingbits) {
 			while (pending != 0) {
 				idx = ffs(pending) - 1;
 				pending &= ~(1 << idx);
 				(void)netisr_process_workstream_proto(nwsp,
 				    idx);
 			}
 		}
 		nwsp->nws_flags &= ~NWS_RUNNING;
 	}
 	NWS_UNLOCK(nwsp);
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 #ifdef DEVICE_POLLING
 	netisr_pollmore();
 #endif
 }
 
 /*
  * Append a packet to a protocol's queue on a workstream.  The workstream
  * lock must be held.  Returns 0 on success, or ENOBUFS after freeing the
  * packet when the queue is at its limit.  *dosignalp is set to 1 when the
  * caller should signal the worker after dropping the lock, 0 otherwise.
  */
 static int
 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
     struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
 {
 
 	NWS_LOCK_ASSERT(nwsp);
 
 	*dosignalp = 0;
 
 	/* Queue full: account for the drop and give the packet back. */
 	if (npwp->nw_len >= npwp->nw_qlimit) {
 		m_freem(m);
 		npwp->nw_qdrops++;
 		return (ENOBUFS);
 	}
 
 	/* Link the packet at the tail of the queue. */
 	m_rcvif_serialize(m);
 	m->m_nextpkt = NULL;
 	if (npwp->nw_head != NULL)
 		npwp->nw_tail->m_nextpkt = m;
 	else
 		npwp->nw_head = m;
 	npwp->nw_tail = m;
 
 	npwp->nw_len++;
 	if (npwp->nw_watermark < npwp->nw_len)
 		npwp->nw_watermark = npwp->nw_len;
 
 	/*
 	 * The pending bit is set even while NWS_RUNNING so that swi_net()
 	 * keeps calling netisr_process_workstream_proto().
 	 */
 	nwsp->nws_pendingbits |= (1 << proto);
 	if ((nwsp->nws_flags &
 	    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) == 0) {
 		nwsp->nws_flags |= NWS_SCHEDULED;
 		*dosignalp = 1;	/* Defer until unlocked. */
 	}
 	npwp->nw_queued++;
 	return (0);
 }
 
 /*
  * Queue a packet for 'proto' on the workstream of the given CPU, taking the
  * workstream lock around the enqueue and signalling the worker afterwards if
  * the enqueue asked for it.  Returns the result of the enqueue.
  */
 static int
 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
 {
 	struct netisr_workstream *nwsp;
 	int error, signal_swi;
 
 #ifdef NETISR_LOCKING
 	NETISR_LOCK_ASSERT();
 #endif
 	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
 	    cpuid, mp_maxid));
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	signal_swi = 0;
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 
 	NWS_LOCK(nwsp);
 	error = netisr_queue_workstream(nwsp, proto, &nwsp->nws_work[proto],
 	    m, &signal_swi);
 	NWS_UNLOCK(nwsp);
 
 	/* The wakeup is issued only after the workstream lock is dropped. */
 	if (signal_swi)
 		NWS_SIGNAL(nwsp);
 	return (error);
 }
 
 /*
  * Queue a packet for deferred processing by protocol 'proto', using
  * 'source' as the source identifier for CPU selection.
  *
  * The mbuf is always consumed.  Returns 0 on success, ENOPROTOOPT (VIMAGE
  * only) when the protocol is disabled in the current vnet, or ENOBUFS when
  * CPU selection returned no mbuf or the target queue is full.
  */
 int
 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	u_int cpuid;
 	int error;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef VIMAGE
 	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		/* Do not leave with the netisr read lock still held. */
 #ifdef NETISR_LOCKING
 		NETISR_RUNLOCK(&tracker);
 #endif
 		return (ENOPROTOOPT);
 	}
 #endif
 
 	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
 	    source, m, &cpuid);
 	if (m != NULL) {
 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
 		    cpuid));
 		VNET_ASSERT(m->m_pkthdr.rcvif != NULL,
 		    ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m));
 		error = netisr_queue_internal(proto, m, cpuid);
 	} else
 		error = ENOBUFS;
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 /*
  * Queue a packet for deferred processing with a source identifier of 0.
  * Thin wrapper around netisr_queue_src(); returns its result.
  */
 int
 netisr_queue(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_queue_src(proto, 0, m));
 }
 
 /*
  * Dispatch a packet for netisr processing; direct dispatch is permitted by
  * calling context.
  *
  * Depending on the effective dispatch policy the packet is queued
  * (deferred), handed straight to the protocol handler (direct), or handled
  * directly only if this CPU is the selected one and its workstream is idle
  * (hybrid).  The mbuf is always consumed.  Returns 0 on success,
  * ENOPROTOOPT (VIMAGE only) when the protocol is disabled in the current
  * vnet, or the error of the queueing path (ENOBUFS).
  */
 int
 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 	u_int cpuid, dispatch_policy;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	npp = &netisr_proto[proto];
 	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
 	    proto));
 
 #ifdef VIMAGE
 	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		/* Do not leave with the netisr read lock still held. */
 #ifdef NETISR_LOCKING
 		NETISR_RUNLOCK(&tracker);
 #endif
 		return (ENOPROTOOPT);
 	}
 #endif
 
 	dispatch_policy = netisr_get_dispatch(npp);
 	if (dispatch_policy == NETISR_DISPATCH_DEFERRED) {
 		/*
 		 * netisr_queue_src() takes the read lock itself; release
 		 * ours first so the lock is neither leaked nor acquired
 		 * recursively.
 		 */
 #ifdef NETISR_LOCKING
 		NETISR_RUNLOCK(&tracker);
 #endif
 		return (netisr_queue_src(proto, source, m));
 	}
 
 	/*
 	 * If direct dispatch is forced, then unconditionally dispatch
 	 * without a formal CPU selection.  Borrow the current CPU's stats,
 	 * even if there's no worker on it.  In this case we don't update
 	 * nws_flags because all netisr processing will be source ordered due
 	 * to always being forced to directly dispatch.
 	 */
 	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
 		nwsp = DPCPU_PTR(nws);
 		npwp = &nwsp->nws_work[proto];
 		npwp->nw_dispatched++;
 		npwp->nw_handled++;
 		netisr_proto[proto].np_handler(m);
 		error = 0;
 		goto out_unlock;
 	}
 
 	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
 	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));
 
 	/*
 	 * Otherwise, we execute in a hybrid mode where we will try to direct
 	 * dispatch if we're on the right CPU and the netisr worker isn't
 	 * already running.
 	 */
 	sched_pin();
 	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
 	    source, m, &cpuid);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto out_unpin;
 	}
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 	if (cpuid != curcpu)
 		goto queue_fallback;
 	nwsp = DPCPU_PTR(nws);
 	npwp = &nwsp->nws_work[proto];
 
 	/*-
 	 * We are willing to direct dispatch only if three conditions hold:
 	 *
 	 * (1) The netisr worker isn't already running,
 	 * (2) Another thread isn't already directly dispatching, and
 	 * (3) The netisr hasn't already been woken up.
 	 */
 	NWS_LOCK(nwsp);
 	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
 		error = netisr_queue_workstream(nwsp, proto, npwp, m,
 		    &dosignal);
 		NWS_UNLOCK(nwsp);
 		if (dosignal)
 			NWS_SIGNAL(nwsp);
 		goto out_unpin;
 	}
 
 	/*
 	 * The current thread is now effectively the netisr worker, so set
 	 * the dispatching flag to prevent concurrent processing of the
 	 * stream from another thread (even the netisr worker), which could
 	 * otherwise lead to effective misordering of the stream.
 	 */
 	nwsp->nws_flags |= NWS_DISPATCHING;
 	NWS_UNLOCK(nwsp);
 	netisr_proto[proto].np_handler(m);
 	NWS_LOCK(nwsp);
 	nwsp->nws_flags &= ~NWS_DISPATCHING;
 	npwp->nw_handled++;
 	npwp->nw_hybrid_dispatched++;
 
 	/*
 	 * If other work was enqueued by another thread while we were direct
 	 * dispatching, we need to signal the netisr worker to do that work.
 	 * In the future, we might want to do some of that work in the
 	 * current thread, rather than trigger further context switches.  If
 	 * so, we'll want to establish a reasonable bound on the work done in
 	 * the "borrowed" context.
 	 */
 	if (nwsp->nws_pendingbits != 0) {
 		nwsp->nws_flags |= NWS_SCHEDULED;
 		dosignal = 1;
 	} else
 		dosignal = 0;
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	error = 0;
 	goto out_unpin;
 
 queue_fallback:
 	error = netisr_queue_internal(proto, m, cpuid);
 out_unpin:
 	sched_unpin();
 out_unlock:
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 /*
  * Dispatch a packet with a source identifier of 0.  Thin wrapper around
  * netisr_dispatch_src(); returns its result.
  */
 int
 netisr_dispatch(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_dispatch_src(proto, 0, m));
 }
 
 #ifdef DEVICE_POLLING
 /*
  * Kernel polling borrows a netisr thread to run interface polling in; this
  * function allows kernel polling to request that the netisr thread be
  * scheduled even if no packets are pending for protocols.
  */
 void
 netisr_sched_poll(void)
 {
 	struct netisr_workstream *nwsp;
 
 	/* Signal the workstream of the first CPU recorded in nws_array. */
 	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
 	NWS_SIGNAL(nwsp);
 }
 #endif
 
 /*
  * Set up the netisr workstream for one CPU: initialise its mutex, create
  * the "netisr <cpuid>" software interrupt running swi_net(), record the
  * interrupt event in the CPU's pcpu data, optionally bind it to the CPU,
  * and finally append the CPU to nws_array under the netisr write lock.
  * Failure to create the software interrupt panics; failure to bind is only
  * reported.
  */
 static void
 netisr_start_swi(u_int cpuid, struct pcpu *pc)
 {
 	char swiname[12];
 	struct netisr_workstream *nwsp;
 	int error;
 
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
 	nwsp->nws_cpu = cpuid;
 	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
 	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
 	    SWI_NET, INTR_TYPE_NET | INTR_MPSAFE, &nwsp->nws_swi_cookie);
 	if (error)
 		panic("%s: swi_add %d", __func__, error);
 	pc->pc_netisr = nwsp->nws_intr_event;
 	if (netisr_bindthreads) {
 		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
 		/* Binding is best effort; terminate the message line. */
 		if (error != 0)
 			printf("%s: cpu %u: intr_event_bind: %d\n", __func__,
 			    cpuid, error);
 	}
 	NETISR_WLOCK();
 	nws_array[nws_count] = nwsp->nws_cpu;
 	nws_count++;
 	NETISR_WUNLOCK();
 }
 
 /*
  * Boot-time initialisation of netisr.  Most global state relies on BSS and
  * static initialisers; this routine sets up the lock, sanitises the
  * maxthreads/defaultqlimit tunables, and starts the first worker(s).
  *
  * Without EARLY_AP_STARTUP only the worker for the current (boot) CPU is
  * started here so that the network stack is usable before the other CPUs
  * come up (for example, diskless boot).
  */
 static void
 netisr_init(void *arg)
 {
 	struct pcpu *pc;
 
 	NETISR_LOCK_INIT();
 
 	/* -1 asks for one thread per CPU; 0 and other negatives mean 1. */
 	if (netisr_maxthreads == -1)
 		netisr_maxthreads = mp_ncpus;	/* use max cpus */
 	else if (netisr_maxthreads <= 0)
 		netisr_maxthreads = 1;		/* default behavior */
 	if (mp_ncpus < netisr_maxthreads) {
 		printf("netisr_init: forcing maxthreads from %d to %d\n",
 		    netisr_maxthreads, mp_ncpus);
 		netisr_maxthreads = mp_ncpus;
 	}
 	if (netisr_maxqlimit < netisr_defaultqlimit) {
 		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
 		    netisr_defaultqlimit, netisr_maxqlimit);
 		netisr_defaultqlimit = netisr_maxqlimit;
 	}
 #ifdef DEVICE_POLLING
 	/*
 	 * Device polling does not yet cope with several netisr threads, so
 	 * building it in restricts netisr to a single, unbound worker.
 	 */
 	if (netisr_bindthreads != 0 || netisr_maxthreads != 1) {
 		printf("netisr_init: forcing maxthreads to 1 and "
 		    "bindthreads to 0 for device polling\n");
 		netisr_maxthreads = 1;
 		netisr_bindthreads = 0;
 	}
 #endif
 
 #ifdef EARLY_AP_STARTUP
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		netisr_start_swi(pc->pc_cpuid, pc);
 	}
 #else
 	pc = get_pcpu();
 	netisr_start_swi(pc->pc_cpuid, pc);
 #endif
 }
 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
 
 #ifndef EARLY_AP_STARTUP
 /*
  * Bring up workers on the remaining CPUs, up to netisr_maxthreads in total.
  * Work is not redistributed gracefully; dynamic reconfiguration is not
  * supported yet.
  */
 static void
 netisr_start(void *arg)
 {
 	struct pcpu *pc;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		/* CPUs that already have a worker (the boot CPU) are skipped. */
 		if (pc->pc_netisr == NULL)
 			netisr_start_swi(pc->pc_cpuid, pc);
 	}
 }
 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
 #endif
 
 /*
  * Sysctl monitoring for netisr: query a list of registered protocols.
  */
 static int
 sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_proto *snpp, *snp_array;
 	struct netisr_proto *npp;
 	u_int counter, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 		npp = &netisr_proto[proto];
 		if (npp->np_name == NULL)
 			continue;
 		snpp = &snp_array[counter];
 		snpp->snp_version = sizeof(*snpp);
 		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
 		snpp->snp_proto = proto;
 		snpp->snp_qlimit = npp->np_qlimit;
 		snpp->snp_policy = npp->np_policy;
 		snpp->snp_dispatch = npp->np_dispatch;
 		if (npp->np_m2flow != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
 		if (npp->np_m2cpuid != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
 		if (npp->np_drainedcpu != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= NETISR_MAXPROT,
 	    ("sysctl_netisr_proto: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
 	free(snp_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, proto,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
     "S,sysctl_netisr_proto",
     "Return list of protocols registered with netisr");
 
 /*
  * Sysctl monitoring for netisr: query a list of workstreams.
  */
 static int
 sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_workstream *snwsp, *snws_array;
 	struct netisr_workstream *nwsp;
 	u_int counter, cpuid;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		snwsp = &snws_array[counter];
 		snwsp->snws_version = sizeof(*snwsp);
 
 		/*
 		 * For now, we equate workstream IDs and CPU IDs in the
 		 * kernel, but expose them independently to userspace in case
 		 * that assumption changes in the future.
 		 */
 		snwsp->snws_wsid = cpuid;
 		snwsp->snws_cpu = cpuid;
 		if (nwsp->nws_intr_event != NULL)
 			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
 		NWS_UNLOCK(nwsp);
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= MAXCPU,
 	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
 	free(snws_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
     "S,sysctl_netisr_workstream",
     "Return list of workstreams implemented by netisr");
 
 /*
  * Sysctl monitoring for netisr: query per-protocol data across all
  * workstreams.
  */
 static int
 sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_work *snwp, *snw_array;
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *nwp;
 	u_int counter, cpuid, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
 	    M_TEMP, M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 			npp = &netisr_proto[proto];
 			if (npp->np_name == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			snwp = &snw_array[counter];
 			snwp->snw_version = sizeof(*snwp);
 			snwp->snw_wsid = cpuid;		/* See comment above. */
 			snwp->snw_proto = proto;
 			snwp->snw_len = nwp->nw_len;
 			snwp->snw_watermark = nwp->nw_watermark;
 			snwp->snw_dispatched = nwp->nw_dispatched;
 			snwp->snw_hybrid_dispatched =
 			    nwp->nw_hybrid_dispatched;
 			snwp->snw_qdrops = nwp->nw_qdrops;
 			snwp->snw_queued = nwp->nw_queued;
 			snwp->snw_handled = nwp->nw_handled;
 			counter++;
 		}
 		NWS_UNLOCK(nwsp);
 	}
 	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
 	    ("sysctl_netisr_work: counter too big (%d)", counter));
 	NETISR_RUNLOCK(&tracker);
 	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
 	free(snw_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, work,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
     "S,sysctl_netisr_work",
     "Return list of per-workstream, per-protocol work in netisr");
 
 #ifdef DDB
 /*
  * Debugger command "show netisr": print one line per (CPU, protocol) pair
  * for every CPU whose workstream has an interrupt event and every protocol
  * that has a handler.  No locks are taken here.
  */
 DB_SHOW_COMMAND(netisr, db_show_netisr)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *nwp;
 	int first, proto;
 	u_int cpuid;
 
 	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
 	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		/* The CPU number is printed only on a CPU's first row. */
 		first = 1;
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 			if (netisr_proto[proto].np_handler == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			if (first) {
 				db_printf("%3d ", cpuid);
 				first = 0;
 			} else
 				db_printf("%3s ", "");
 			/* Columns follow the header order printed above. */
 			db_printf(
 			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
 			    netisr_proto[proto].np_name, nwp->nw_len,
 			    nwp->nw_watermark, nwp->nw_qlimit,
 			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
 			    nwp->nw_qdrops, nwp->nw_queued);
 		}
 	}
 }
 #endif
diff --git a/sys/net/route.c b/sys/net/route.c
index 0d6227a515f7..1373b0986876 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -1,723 +1,724 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 /************************************************************************
  * Note: In this file a 'fib' is a "forwarding information base"	*
  * Which is the new name for an in kernel routing (next hop) table.	*
  ***********************************************************************/
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mrouting.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/devctl.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_mroute.h>
 #include <netinet6/in6_var.h>
 
 VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat);
 
 VNET_PCPUSTAT_SYSINIT(rtstat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(rtstat);
 #endif
 
 EVENTHANDLER_LIST_DEFINE(rt_addrmsg);
 
 static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
     void *arg);
 
 /*
  * Route initialization must occur before ip6_init2(), which happens at
  * SI_ORDER_MIDDLE.
  */
 static void
 route_init(void)
 {
 
 	/* The only work done here is setting up the nexthop subsystem. */
 	nhops_init();
 }
 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
 
 /*
  * Allocate and initialise a routing table head.
  *
  * offset is passed through to rn_inithead_internal() for the main tree;
  * family and fibnum are recorded in the new head.  The allocation uses
  * M_WAITOK, and the new head is returned to the caller.
  */
 struct rib_head *
 rt_table_init(int offset, int family, u_int fibnum)
 {
 	struct rib_head *rh;
 
 	rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO);
 
 	/* TODO: These details should be hidden inside radix.c */
 	/* Init the main tree and the masks tree */
 	rn_inithead_internal(&rh->head, rh->rnh_nodes, offset);
 	rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0);
 	rh->head.rnh_masks = &rh->rmhead;
 
 	/* Save metadata associated with this routing table. */
 	rh->rib_family = family;
 	rh->rib_fibnum = fibnum;
 #ifdef VIMAGE
 	rh->rib_vnet = curvnet;
 #endif
 
 	tmproutes_init(rh);
 
 	/* Init locks */
 	RIB_LOCK_INIT(rh);
 
 	nhops_init_rib(rh);
 
 	/* Init subscription system */
 	rib_init_subscriptions(rh);
 
 	/* Finally, set base callbacks */
 	rh->rnh_addaddr = rn_addroute;
 	rh->rnh_deladdr = rn_delete;
 	rh->rnh_matchaddr = rn_match;
 	rh->rnh_lookup = rn_lookup;
 	rh->rnh_walktree = rn_walktree;
 	rh->rnh_walktree_from = rn_walktree_from;
 
 	return (rh);
 }
 
 /*
  * Tree-walk callback: delete one entry from the radix head passed in 'arg'
  * and free whatever node rn_delete() hands back.  Always returns 0.
  */
 static int
 rt_freeentry(struct radix_node *rn, void *arg)
 {
 	struct radix_head * const rnh = arg;
 	struct radix_node *x;
 
 	/*
 	 * NOTE(review): the key handed to rn_delete() is the memory just past
 	 * the second radix_node starting at rn; presumably the key is stored
 	 * right after the node pair - confirm against radix.c.
 	 */
 	x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh);
 	if (x != NULL)
 		R_Free(x);
 	return (0);
 }
 
 /*
  * Tear down a routing table head created by rt_table_init() and free it.
  * The table is first marked dying under its write lock, then the attached
  * subsystems are destroyed, the masks tree is emptied, and finally the lock
  * and the head itself are released.
  */
 void
 rt_table_destroy(struct rib_head *rh)
 {
 
 	/* Flag the table as going away before dismantling anything. */
 	RIB_WLOCK(rh);
 	rh->rib_dying = true;
 	RIB_WUNLOCK(rh);
 
 #ifdef FIB_ALGO
 	fib_destroy_rib(rh);
 #endif
 
 	tmproutes_destroy(rh);
 
 	/* Delete and free every entry of the masks tree. */
 	rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
 
 	nhops_destroy_rib(rh);
 
 	rib_destroy_subscriptions(rh);
 
 	/* Assume table is already empty */
 	RIB_LOCK_DESTROY(rh);
 	free(rh, M_RTABLE);
 }
 
 /*
  * Adds a temporary redirect entry to the routing table.
  * @fibnum: fib number
  * @dst: destination to install redirect to
  * @gateway: gateway to go via
  * @author: sockaddr of originating router, can be NULL
  * @ifp: interface to use for the redirected route
  * @flags: set of flags to add. Allowed: RTF_GATEWAY
  * @lifetime_sec: time in seconds to expire this redirect.
  *
  * Must be called within the network epoch.
  *
  * Returns 0 on success, errno otherwise: EAFNOSUPPORT when there is no
  * table for the fib/family, ENETUNREACH when no suitable interface address
  * is found for the gateway, ENOMEM when the nexthop cannot be allocated, or
  * the error reported while finalising the nexthop or adding the route.
  */
 int
 rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec)
 {
 	struct route_nhop_data rnd = { .rnd_weight = RT_DEFAULT_WEIGHT };
 	struct rib_cmd_info rc;
 	struct ifaddr *ifa;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL)
 		return (EAFNOSUPPORT);
 
 	/* Verify the allowed flag mask. */
 	KASSERT(((flags & ~(RTF_GATEWAY)) == 0),
 	    ("invalid redirect flags: %x", flags));
 	/* Redirects are always installed as dynamic host routes. */
 	flags |= RTF_HOST | RTF_DYNAMIC;
 
 	/* Get the best ifa for the given interface and gateway. */
 	if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL)
 		return (ENETUNREACH);
 
 	struct nhop_object *nh = nhop_alloc(fibnum, dst->sa_family);
 	if (nh == NULL)
 		return (ENOMEM);
 
 	/* Describe the nexthop: gateway, interface, source and expiry. */
 	nhop_set_gw(nh, gateway, flags & RTF_GATEWAY);
 	nhop_set_transmit_ifp(nh, ifp);
 	nhop_set_src(nh, ifa);
 	nhop_set_pxtype_flag(nh, NHF_HOST);
 	nhop_set_expire(nh, lifetime_sec + time_uptime);
 	nhop_set_redirect(nh, true);
 	nhop_set_origin(nh, NH_ORIGIN_REDIRECT);
 	rnd.rnd_nhop = nhop_get_nhop(nh, &error);
 	if (error == 0) {
 		/*
 		 * NOTE(review): a prefix length of -1 is presumably the
 		 * "host route" convention of rib_add_route_px() - confirm.
 		 */
 		error = rib_add_route_px(fibnum, dst, -1,
 		    &rnd, RTM_F_CREATE, &rc);
 	}
 
 	if (error != 0) {
 		/* TODO: add per-fib redirect stats. */
 		return (error);
 	}
 
 	RTSTAT_INC(rts_dynamic);
 
 	/* Send notification of a route addition to userland. */
 	struct rt_addrinfo info = {
 		.rti_info[RTAX_DST] = dst,
 		.rti_info[RTAX_GATEWAY] = gateway,
 		.rti_info[RTAX_AUTHOR] = author,
 	};
 	/* error is always 0 at this point. */
 	rt_missmsg_fib(RTM_REDIRECT, &info, flags | RTF_UP, error, fibnum);
 
 	return (0);
 }
 
 /*
  * Routing table ioctl entry point.  With INET the request is forwarded to
  * the multicast routing hook when one is installed (EOPNOTSUPP otherwise);
  * without INET the call always fails with ENXIO.
  */
 int
 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
 {
 
 	/*
 	 * Any ioctl command added here must carry its own super-user check:
 	 * prison-root can get this far when raw sockets are enabled in
 	 * jails.
 	 */
 #ifdef INET
 	/* Multicast goop, grrr... */
 	if (!mrt_ioctl)
 		return (EOPNOTSUPP);
 	return (mrt_ioctl(req, data, fibnum));
 #else /* INET */
 	return (ENXIO);
 #endif /* INET */
 }
 
 /*
  * Find the interface address to associate with a route to 'dst' via
  * 'gateway' in the given fib.  Candidates are tried in order: a
  * point-to-point peer match, a local address match, a directly connected
  * network, and finally a routing lookup of the gateway.  Returns NULL when
  * nothing fits or the gateway is reachable only through the default route.
  * Must be called within the network epoch.
  */
 struct ifaddr *
 ifa_ifwithroute(int flags, const struct sockaddr *dst,
     const struct sockaddr *gateway, u_int fibnum)
 {
 	struct ifaddr *ifa, *samefam;
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 	if ((flags & RTF_GATEWAY) != 0) {
 		/*
 		 * Route to a remote net or host: the gateway may still be
 		 * the far end of a point-to-point link.
 		 */
 		ifa = ifa_ifwithdstaddr(gateway, fibnum);
 	} else {
 		/*
 		 * Interface route: on a point-to-point link the destination
 		 * is the clue to the interface (host routes only);
 		 * otherwise the local address can be used.
 		 */
 		ifa = NULL;
 		if ((flags & RTF_HOST) != 0)
 			ifa = ifa_ifwithdstaddr(dst, fibnum);
 		if (ifa == NULL)
 			ifa = ifa_ifwithaddr(gateway);
 	}
 	if (ifa == NULL)
 		ifa = ifa_ifwithnet(gateway, 0, fibnum);
 	if (ifa == NULL) {
 		nh = rib_lookup(fibnum, gateway, NHR_NONE, 0);
 
 		/*
 		 * A gateway that can only be reached through the default
 		 * router is not acceptable.
 		 */
 		if (nh == NULL || (nh->nh_flags & NHF_DEFAULT) != 0)
 			return (NULL);
 		ifa = nh->nh_ifa;
 	}
 	if (ifa->ifa_addr->sa_family == dst->sa_family)
 		return (ifa);
 
 	/*
 	 * Address family mismatch: prefer an address on the same interface
 	 * chosen for dst, keeping the original one if there is none.
 	 */
 	samefam = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
 	return (samefam != NULL ? samefam : ifa);
 }
 
 /*
  * Route-walk filter used while a network interface is being detached.
  *
  * Arguments:
  *	rt	route entry being visited
  *	nh	nexthop attached to @rt
  *	arg	the detaching interface (struct ifnet *)
  *
  * Returns:
  *	1	@nh uses the detaching interface and @rt is RTF_UP
  *	0	otherwise
  *
  * NOTE(review): presumably a non-zero return makes
  * rib_foreach_table_walk_del() remove the entry; confirm against that
  * function's contract.
  */
 static int
 rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
 {
 	const struct ifnet *detaching = arg;
 
 	/*
 	 * The RTF_UP test protects (sorta) against walktree recursion
 	 * problems with cloned routes.
 	 */
 	if (nh->nh_ifp == detaching && (rt->rte_flags & RTF_UP) != 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Remove the routes that reference interface @ifp.
  *
  * Runs rib_foreach_table_walk_del() with AF_UNSPEC (presumably meaning
  * "every address family" - confirm) using rt_ifdelroute() as the filter
  * and @ifp as its argument.
  */
 void
 rt_flushifroutes(struct ifnet *ifp)
 {
 
 	rib_foreach_table_walk_del(AF_UNSPEC, rt_ifdelroute, ifp);
 }
 
 /*
  * Resolve the interface described by the RTAX_IFP sockaddr in @info.
  *
  * The sockaddr must be an AF_LINK sockaddr_dl.  A non-zero sdl_index is
  * looked up directly; otherwise a non-empty interface name stored in
  * sdl_data is used, provided it fits both the sockaddr (sdl_len) and an
  * IF_NAMESIZE buffer including the terminating NUL.
  *
  * The caller guarantees that info->rti_info[RTAX_IFP] is not NULL.
  *
  * Returns the interface found or NULL.
  */
 static struct ifnet *
 info_get_ifp(struct rt_addrinfo *info)
 {
 	const struct sockaddr_dl *sdl;
 	char if_name[IF_NAMESIZE];
 
 	sdl = (const struct sockaddr_dl *)info->rti_info[RTAX_IFP];
 	if (sdl->sdl_family != AF_LINK)
 		return (NULL);
 
 	/* An explicit interface index wins over the name. */
 	if (sdl->sdl_index != 0)
 		return (ifnet_byindex(sdl->sdl_index));
 
 	if (sdl->sdl_nlen == 0)
 		return (NULL);
 
 	/* Reject names running past the sockaddr or not fitting if_name. */
 	if (sdl->sdl_nlen + offsetof(struct sockaddr_dl, sdl_data) > sdl->sdl_len ||
 	    sdl->sdl_nlen >= IF_NAMESIZE)
 		return (NULL);
 
 	/* sdl_data is not NUL-terminated: copy into a zeroed buffer. */
 	bzero(if_name, sizeof(if_name));
 	memcpy(if_name, sdl->sdl_data, sdl->sdl_nlen);
 
 	return (ifunit(if_name));
 }
 
 /*
  * Fill in rti_ifp/rti_ifa of @info for routes whose gateway address
  * family differs from the destination address family.
  *
  * Returns 0 on success or ENETUNREACH when no interface or address
  * could be determined.
  */
 __noinline static int
 rt_getifa_family(struct rt_addrinfo *info, uint32_t fibnum)
 {
 	if (info->rti_ifp == NULL) {
 		/*
 		 * No transmit interface specified: derive it from the
 		 * gateway sockaddr.
 		 */
 		const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
 		struct ifaddr *gw_ifa;
 
 		gw_ifa = ifa_ifwithroute(RTF_GATEWAY, gw, gw, fibnum);
 		if (gw_ifa == NULL)
 			return (ENETUNREACH);
 		info->rti_ifp = gw_ifa->ifa_ifp;
 	}
 
 	/* An address configured on the outgoing interface is preferred. */
 	info->rti_ifa = ifaof_ifpforaddr(info->rti_info[RTAX_DST], info->rti_ifp);
 #ifdef INET
 	if (info->rti_ifa == NULL) {
 		/* Otherwise settle for the first IPv4 address found. */
 		bool loopback_ok = (info->rti_ifp->if_flags & IFF_LOOPBACK) != 0;
 
 		info->rti_ifa = (struct ifaddr *)in_findlocal(fibnum, loopback_ok);
 	}
 #endif
 	return ((info->rti_ifa != NULL) ? 0 : ENETUNREACH);
 }
 
 /*
  * Fills in rti_ifp and rti_ifa for the provided fib.
  *
  * Assume basic consistency checks are executed by callers:
  * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well.
  *
  * Values already present in info->rti_ifp / info->rti_ifa are kept; only
  * missing ones are looked up.
  *
  * Returns 0 on success or ENETUNREACH if no interface address was found.
  */
 int
 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
 {
 	const struct sockaddr *dst, *gateway, *ifaaddr;
 	int error, flags;
 
 	dst = info->rti_info[RTAX_DST];
 	gateway = info->rti_info[RTAX_GATEWAY];
 	ifaaddr = info->rti_info[RTAX_IFA];
 	flags = info->rti_flags;
 
 	/*
 	 * ifp may be specified by sockaddr_dl
 	 * when protocol address is ambiguous.
 	 */
 	error = 0;
 
 	/* If we have interface specified by RTAX_IFP address, try to use it */
 	if ((info->rti_ifp == NULL) && (info->rti_info[RTAX_IFP] != NULL))
 		info->rti_ifp = info_get_ifp(info);
 	/*
 	 * If we have source address specified, try to find it
 	 * TODO: avoid enumerating all ifas on all interfaces.
 	 */
 	if (info->rti_ifa == NULL && ifaaddr != NULL)
 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
 	/*
 	 * Gateway of a different address family than the destination:
 	 * handled entirely by rt_getifa_family().
 	 */
 	if ((info->rti_ifa == NULL) && ((info->rti_flags & RTF_GATEWAY) != 0) &&
 	    (gateway->sa_family != dst->sa_family))
 		return (rt_getifa_family(info, fibnum));
 	if (info->rti_ifa == NULL) {
 		const struct sockaddr *sa;
 
 		/*
 		 * Most common use case for the userland-supplied routes.
 		 *
 		 * Choose sockaddr to select ifa.
 		 * -- if ifp is set --
 		 * Order of preference:
 		 * 1) IFA address
 		 * 2) gateway address
 		 *   Note: for interface routes link-level gateway address
 		 *     is specified to indicate the interface index without
 		 *     specifying RTF_GATEWAY. In this case, ignore gateway
 		 *   Note: gateway AF may be different from dst AF. In this case,
 		 *   ignore gateway
 		 * 3) final destination.
 		 * 4) if all of these fails, try to get at least link-level ifa.
 		 * -- else --
 		 * try to lookup gateway or dst in the routing table to get ifa
 		 */
 		if (info->rti_info[RTAX_IFA] != NULL)
 			sa = info->rti_info[RTAX_IFA];
 		else if ((info->rti_flags & RTF_GATEWAY) != 0 &&
 		    gateway->sa_family == dst->sa_family)
 			sa = gateway;
 		else
 			sa = dst;
 		if (info->rti_ifp != NULL) {
 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
 			/* Case 4 */
 			if (info->rti_ifa == NULL && gateway != NULL)
 				info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp);
 		} else if (dst != NULL && gateway != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway,
 							fibnum);
 		else if (sa != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, sa, sa,
 							fibnum);
 	}
 	/* Derive the interface from the address when it was not given. */
 	if (info->rti_ifa != NULL) {
 		if (info->rti_ifp == NULL)
 			info->rti_ifp = info->rti_ifa->ifa_ifp;
 	} else
 		error = ENETUNREACH;
 	return (error);
 }
 
 /*
  * Propagate the MTU of @ifp to the nexthops that use it.
  *
  * There is no per-interface index of nexthops, so every routing table of
  * every address family (1..AF_MAX) in every fib is visited.
  */
 void
 rt_updatemtu(struct ifnet *ifp)
 {
 
 	for (int family = 1; family <= AF_MAX; family++) {
 		/* The MTU is family-specific: fetch it once per family. */
 		int family_mtu = if_getmtu_family(ifp, family);
 
 		for (int fib = 0; fib < rt_numfibs; fib++) {
 			struct rib_head *rh = rt_tables_get_rnh(fib, family);
 
 			/* Not every family has a table in every fib. */
 			if (rh != NULL)
 				nhops_update_ifmtu(rh, ifp, family_mtu);
 		}
 	}
 }
 
 /*
  * Debugging helpers for printing routes.  Compiled out; they still use
  * the rt_flags / rt_nhop field names.
  */
 #if 0
 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
 int rt_print(char *buf, int buflen, struct rtentry *rt);
 
 /*
  * Format the AF_INET/AF_INET6 address of @s into @buf with inet_ntop().
  * Returns the length of the resulting string, or 0 for other address
  * families and on conversion failure.
  */
 int
 p_sockaddr(char *buf, int buflen, struct sockaddr *s)
 {
 	void *paddr = NULL;
 
 	switch (s->sa_family) {
 	case AF_INET:
 		paddr = &((struct sockaddr_in *)s)->sin_addr;
 		break;
 	case AF_INET6:
 		paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
 		break;
 	}
 
 	if (paddr == NULL)
 		return (0);
 
 	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
 		return (0);
 
 	return (strlen(buf));
 }
 
 /*
  * Format @rt into @buf as "key[/mask][>gateway]" and return the number
  * of characters produced.
  *
  * NOTE(review): the separator writes (buf[i++]) are not checked against
  * @buflen - fix before re-enabling this code.
  */
 int
 rt_print(char *buf, int buflen, struct rtentry *rt)
 {
 	struct sockaddr *addr, *mask;
 	int i = 0;
 
 	addr = rt_key(rt);
 	mask = rt_mask(rt);
 
 	i = p_sockaddr(buf, buflen, addr);
 	if (!(rt->rt_flags & RTF_HOST)) {
 		buf[i++] = '/';
 		i += p_sockaddr(buf + i, buflen - i, mask);
 	}
 
 	if (rt->rt_flags & RTF_GATEWAY) {
 		buf[i++] = '>';
 		i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa);
 	}
 
 	return (i);
 }
 #endif
 
 void
 rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
     const struct sockaddr *netmask)
 {
 	const u_char *cp1 = (const u_char *)src;
 	u_char *cp2 = (u_char *)dst;
 	const u_char *cp3 = (const u_char *)netmask;
 	u_char *cplim = cp2 + *cp3;
 	u_char *cplim2 = cp2 + *cp1;
 
 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
 	cp3 += 2;
 	if (cplim > cplim2)
 		cplim = cplim2;
 	while (cp2 < cplim)
 		*cp2++ = *cp1++ & *cp3++;
 	if (cp2 < cplim2)
 		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
 /*
  * Announce interface address arrival/withdraw
  *
  * @cmd:    RTM_ADD or RTM_DELETE (asserted)
  * @ifa:    the interface address being added or removed
  * @fibnum: fib the address belongs to; must be within [0, rt_numfibs)
  *
  * Invokes the rt_addrmsg event handlers, sends an "IFNET" devctl
  * notification (ADDR_ADD / ADDR_DEL) for IPv4 and IPv6 addresses and
  * finally hands the message to rtsock_addrmsg().
  *
  * Returns 0 on success.
  */
 int
 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	/* sa/ifp are only consumed by the devctl notifications below. */
 #if defined(INET) || defined(INET6)
 	struct sockaddr *sa = ifa->ifa_addr;
 	struct ifnet *ifp = ifa->ifa_ifp;
 #endif
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	KASSERT((fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd);
 
 #ifdef INET
 	if (sa->sa_family == AF_INET) {
 		char addrstr[INET_ADDRSTRLEN];
 		/* Extra room for the "address=" prefix. */
 		char strbuf[INET_ADDRSTRLEN + 12];
 
 		inet_ntoa_r(((struct sockaddr_in *)sa)->sin_addr, addrstr);
 		snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr);
 		devctl_notify("IFNET", ifp->if_xname,
 		    (cmd == RTM_ADD) ? "ADDR_ADD" : "ADDR_DEL", strbuf);
 	}
 #endif
 #ifdef INET6
 	if (sa->sa_family == AF_INET6) {
 		char addrstr[INET6_ADDRSTRLEN];
 		/* Extra room for the "address=" prefix. */
 		char strbuf[INET6_ADDRSTRLEN + 12];
 
 		ip6_sprintf(addrstr, IFA_IN6(ifa));
 		snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr);
 		devctl_notify("IFNET", ifp->if_xname,
 		    (cmd == RTM_ADD) ? "ADDR_ADD" : "ADDR_DEL", strbuf);
 	}
 #endif
 
 	/* With rt_add_addr_allfibs set the message is sent for all fibs. */
 	if (V_rt_add_addr_allfibs)
 		fibnum = RT_ALL_FIBS;
 	return (rtsock_addrmsg(cmd, ifa, fibnum));
 }
 
 /*
  * Announce kernel-originated route addition/removal/change to rtsock based
  * on @rt data.
  * cmd: RTM_ cmd; one of RTM_ADD, RTM_DELETE or RTM_CHANGE (asserted)
  * @rt: valid rtentry; rt_key(rt) must not be NULL (asserted)
  * @nh: nhop object to announce
  * @fibnum: fib id or RT_ALL_FIBS
  *
  * The checks are KASSERTs only; the message itself is produced by
  * rtsock_routemsg().
  *
  * Returns 0 on success.
  */
 int
 rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE,
 	    ("unexpected cmd %d", cmd));
 
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
 
 	return (rtsock_routemsg(cmd, rt, nh, fibnum));
 }
 
 /*
  * Announce kernel-originated route addition/removal/change to rtsock based
  * on @info data.
  * cmd: RTM_ cmd; one of RTM_ADD, RTM_DELETE or RTM_CHANGE (asserted)
  * @info: addrinfo structure with valid data; RTAX_DST must be set (asserted)
  * @fibnum: fib id or RT_ALL_FIBS
  *
  * The checks are KASSERTs only; the message itself is produced by
  * rtsock_routemsg_info().
  *
  * Returns 0 on success.
  */
 int
 rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE,
 	    ("unexpected cmd %d", cmd));
 
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__));
 
 	return (rtsock_routemsg_info(cmd, info, fibnum));
 }
 
 /*
  * Report an interface change for @ifp through the ifmsg_f callback of
  * both the rtsock and the netlink bridge, passing @if_flags_mask along
  * unchanged.
  */
 void
 rt_ifmsg(struct ifnet *ifp, int if_flags_mask)
 {
 	rtsock_callback_p->ifmsg_f(ifp, if_flags_mask);
 	netlink_callback_p->ifmsg_f(ifp, if_flags_mask);
 }
 
 /* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */
 
 /* No-op route event callback used by the default bridge below. */
 static void
 ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
 {
 }
 
 /* No-op interface message callback used by the default bridge below. */
 static void
 ignore_ifmsg_event(struct ifnet *ifp, int if_flags_mask)
 {
 }
 
 /*
  * Default bridge whose callbacks do nothing.  Both bridge pointers start
  * out pointing here, so callers such as rt_ifmsg() can invoke the
  * callbacks without a NULL check.
  */
 static struct rtbridge ignore_cb = {
 	.route_f = ignore_route_event,
 	.ifmsg_f = ignore_ifmsg_event,
 };
 
 void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */
 struct rtbridge *rtsock_callback_p = &ignore_cb;
 struct rtbridge *netlink_callback_p = &ignore_cb;
diff --git a/sys/net/route/nhgrp_ctl.c b/sys/net/route/nhgrp_ctl.c
index b829b1125597..1aba9d6bea28 100644
--- a/sys/net/route/nhgrp_ctl.c
+++ b/sys/net/route/nhgrp_ctl.c
@@ -1,977 +1,978 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include "opt_inet.h"
 #include "opt_route.h"
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/epoch.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <net/route/nhgrp_var.h>
 
 #define	DEBUG_MOD_NAME	nhgrp_ctl
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
 #include <net/route/route_debug.h>
 _DECLARE_DEBUG(LOG_INFO);
 
 /*
  * This file contains the supporting functions for creating multipath groups
  *  and compiling their dataplane parts.
  */
 
 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
 /* Offset and size of flags field has to be the same for nhop/nhop groups */
 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
 
 static int wn_cmp_idx(const void *a, const void *b);
 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
 
 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
     struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
 static void destroy_nhgrp_epoch(epoch_context_t ctx);
 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
 
 /*
  * qsort() comparator ordering weightened nexthops by nexthop index
  * (nh_priv->nh_idx) ascending.
  * Returns -1, 0 or 1.
  */
 static int
 wn_cmp_idx(const void *a, const void *b)
 {
 	const struct weightened_nhop *lhs = a;
 	const struct weightened_nhop *rhs = b;
 	uint32_t lhs_idx = lhs->nh->nh_priv->nh_idx;
 	uint32_t rhs_idx = rhs->nh->nh_priv->nh_idx;
 
 	/* Comparison form avoids the wraparound of a plain subtraction. */
 	return ((lhs_idx > rhs_idx) - (lhs_idx < rhs_idx));
 }
 
 /*
  * Perform in-place sorting for array of nexthops in @wn.
  * Sort by nexthop index ascending.
  */
 static void
 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
 {
 
 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
 }
 
 /*
  * Build a sorted (ascending) list of the weights found in @wn.
  *
  * The entries themselves are not reordered: the sorted weights are
  * written to the spare "storage" field of `struct weightened_nhop`, so
  * that storage[0..num_items-1] holds the weights in ascending order.
  * Weights are expected to be (mostly) equal, which makes insertion sort
  * a cheap choice.
  */
 static void
 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
 {
 	wn[0].storage = wn[0].weight;
 	for (int idx = 1; idx < num_items; idx++) {
 		/* Always read 'weight': unlike 'storage' it is never moved */
 		const uint32_t cur = wn[idx].weight;
 		int pos = idx;
 
 		/* Shift every larger sorted weight one position right */
 		while (pos > 0 && wn[pos - 1].storage > cur) {
 			wn[pos].storage = wn[pos - 1].storage;
 			pos--;
 		}
 		wn[pos].storage = cur;
 	}
 }
 
 /*
  * Calculate minimum number of slots required to fit the existing
  * set of weights in the common use case where weights are "easily"
  * comparable.
  *
  * The weights of @wn are sorted into the .storage fields first; @wn
  * itself is not reordered.  The sum of all weights is stored in @ptotal.
  *
  * Returns number of slots or 0 if precise calculation failed.  A set
  * consisting solely of zero weights yields 0 with *ptotal == 0.
  *
  * Some examples:
  * note: (i, X) pair means (nhop=i, weight=X):
  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
  */
 static uint32_t
 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
     uint64_t *ptotal)
 {
 	uint32_t i, last, xmin;
 	uint64_t total = 0;
 
 	/* Get sorted array of weights in .storage field */
 	sort_weightened_nhops_weights(wn, num_items);
 
 	/*
 	 * Find the smallest non-zero step between consecutive sorted
 	 * weights (the first one is compared against 0).
 	 */
 	last = 0;
 	xmin = wn[0].storage;
 	for (i = 0; i < num_items; i++) {
 		total += wn[i].storage;
 		if ((wn[i].storage != last) &&
 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
 			xmin = wn[i].storage - last;
 		}
 		last = wn[i].storage;
 	}
 	*ptotal = total;
 	/*
 	 * xmin can only remain 0 when every weight is 0.  Bail out before
 	 * using it as a divisor; the caller detects this via *ptotal == 0.
 	 */
 	if (xmin == 0)
 		return (0);
 	/* xmin is the minimum unit of desired capacity */
 	if ((total % xmin) != 0)
 		return (0);
 	for (i = 0; i < num_items; i++) {
 		if ((wn[i].weight % xmin) != 0)
 			return (0);
 	}
 
 	return ((uint32_t)(total / xmin));
 }
 
 /*
  * Calculate the number of dataplane slots needed to represent the
  * weights in @wn while maintaining their ratios.
  *
  * A simple precise solution is attempted first.  If there is none, or
  * it would need more than RIB_MAX_MPATH_WIDTH slots, the result is
  * RIB_MAX_MPATH_WIDTH.
  *
  * Returns 0 when the sum of all weights is 0.
  */
 static uint32_t
 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items)
 {
 	uint64_t weight_sum;
 	uint32_t slots;
 
 	slots = calc_min_mpath_slots_fast(wn, num_items, &weight_sum);
 	if (weight_sum == 0)
 		return (0);
 
 	/* No precise answer, or one too wide: use the maximum width. */
 	if (slots == 0 || slots > RIB_MAX_MPATH_WIDTH)
 		return (RIB_MAX_MPATH_WIDTH);
 
 	return (slots);
 }
 
 /*
  * Nexthop group data consists of
  * 1) dataplane part, with nhgrp_object as a header followed by an
  *   arbitrary number of nexthop pointers.
  * 2) control plane part, with nhgrp_priv as a header, followed by
  *   an arbitrary number of 'struct weightened_nhop' objects.
  *
  * Given nexthop groups are (mostly) immutable, all of it lives in a
  * single allocation.  Returns the size of that allocation for a group
  * with @nhg_size dataplane slots and @num_nhops weightened nexthops.
  */
 __noinline static size_t
 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
 {
 
 	return (sizeof(struct nhgrp_object) +
 	    nhg_size * sizeof(struct nhop_object *) +
 	    sizeof(struct nhgrp_priv) +
 	    num_nhops * sizeof(struct weightened_nhop));
 }
 
 /*
  * Compile actual list of nexthops to be used by datapath from
  *  the control plane list @x, filling @num_slots slots of the nexthop
  *  group owned by @dst_priv.
  *
  * Each nexthop receives a share of the remaining slots proportional to
  *  its share of the remaining weight sum.
  *
  * For example, compiling control plane list of 2 nexthops
  *  [(200, A), (100, B)] would result in the datapath array
  *  [A, A, B]
  */
 static void
 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
     uint32_t num_slots)
 {
 	struct nhgrp_object *dst;
 	int i, slot_idx, remaining_slots;
 	uint64_t remaining_sum, nh_weight, nh_slots;
 
 	slot_idx  = 0;
 	dst = dst_priv->nhg;
 	/* Calculate sum of all weights */
 	remaining_sum = 0;
 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
 		remaining_sum += x[i].weight;
 	remaining_slots = num_slots;
 	/* 64-bit values are printed via %ju: %lu is too narrow on ILP32 */
 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %ju, slots: %d",
 	    (uintmax_t)remaining_sum, remaining_slots);
 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
 		/* Calculate number of slots for the current nexthop */
 		if (remaining_sum > 0) {
 			nh_weight = (uint64_t)x[i].weight;
 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
 		} else
 			nh_slots = 0;
 
 		/* Later nexthops share only what is left over. */
 		remaining_sum -= x[i].weight;
 		remaining_slots -= nh_slots;
 
 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
 		    " rem_sum: %ju, rem_slots: %d nh_slots: %d, slot_idx: %d",
 		    (uintmax_t)remaining_sum, remaining_slots, (int)nh_slots,
 		    slot_idx);
 
 		KASSERT((slot_idx + nh_slots <= num_slots),
 		    ("index overflow during nhg compilation"));
 		while (nh_slots-- > 0)
 			dst->nhops[slot_idx++] = x[i].nh;
 	}
 }
 
 /*
  * Allocates new nexthop group for the list of weightened nexthops.
  * Assume sorted list.
  * Does NOT reference any nexthops in the group.
  *
  * The dataplane and control plane parts are carved out of a single
  * zeroed M_NOWAIT allocation (see get_nhgrp_alloc_size()).
  *
  * Returns group with refcount=1 or NULL if all weights are zero or the
  * allocation fails.
  */
 static struct nhgrp_priv *
 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
 {
 	uint32_t nhgrp_size;
 	struct nhgrp_object *nhg;
 	struct nhgrp_priv *nhg_priv;
 
 	/* Number of dataplane slots needed to represent the weights. */
 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
 	if (nhgrp_size == 0) {
 		/* Zero weights, abort */
 		return (NULL);
 	}
 
 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
 	if (nhg == NULL) {
 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
 		    "unable to allocate group with num_nhops %d (compiled %u)",
 		    num_nhops, nhgrp_size);
 		return (NULL);
 	}
 
 	/* Has to be the first to make NHGRP_PRIV() work */
 	nhg->nhg_size = nhgrp_size;
 	nhg->nhg_flags = MPF_MULTIPATH;
 
 	nhg_priv = NHGRP_PRIV(nhg);
 	nhg_priv->nhg_nh_count = num_nhops;
 	refcount_init(&nhg_priv->nhg_refcount, 1);
 
 	/* Please see nhgrp_free() comments on the initial value */
 	refcount_init(&nhg_priv->nhg_linked, 2);
 
 	nhg_priv->nhg = nhg;
 	/* Keep a private copy of the control plane (nexthop, weight) list. */
 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
 	  num_nhops * sizeof(struct weightened_nhop));
 
 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
 	    num_nhops, nhgrp_size);
 
 	/* Fill the dataplane slot array from the weights. */
 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
 
 	return (nhg_priv);
 }
 
 /*
  * Acquire an additional reference on the nexthop group @nhg.
  * The group must already hold at least one reference (asserted).
  */
 void
 nhgrp_ref_object(struct nhgrp_object *nhg)
 {
 	struct nhgrp_priv *nhg_priv;
 	/* Previous refcount value; consumed by the KASSERT only. */
 	u_int old __diagused;
 
 	nhg_priv = NHGRP_PRIV(nhg);
 	old = refcount_acquire(&nhg_priv->nhg_refcount);
 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
 }
 
 /*
  * Release one reference on the nexthop group @nhg.
  *
  * When the last reference is dropped, the group is unlinked from its
  * nh_control (unless that already happened, see below) and its
  * destruction is scheduled via NET_EPOCH_CALL().
  */
 void
 nhgrp_free(struct nhgrp_object *nhg)
 {
 	struct nhgrp_priv *nhg_priv;
 	struct nh_control *ctl;
 	struct epoch_tracker et;
 
 	nhg_priv = NHGRP_PRIV(nhg);
 
 	/* Nothing more to do unless this was the last reference. */
 	if (!refcount_release(&nhg_priv->nhg_refcount))
 		return;
 
 	/*
 	 * group objects don't have an explicit lock attached to it.
 	 * As groups are reclaimed based on reference count, it is possible
 	 * that some groups will persist after vnet destruction callback
 	 * called. Given that, handle scenario with nhgrp_free() being
 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
 	 * by using another reference counter: nhg_linked.
 	 *
 	 * There are only 2 places, where nhg_linked can be decreased:
 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
 	 * nhg_linked can never be increased.
 	 *
 	 * Hence, use initial value of 2 to make use of
 	 *  refcount_release_if_not_last().
 	 *
 	 * There can be two scenarios when calling this function:
 	 *
 	 * 1) nhg_linked value is 2. This means that either
 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
 	 *  but we are guaranteed that nh_control won't be freed in
 	 *  this epoch. Hence, nexthop can be safely unlinked.
 	 *
 	 * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
 	 *  has been called and nhgrp unlink can be skipped.
 	 */
 
 	NET_EPOCH_ENTER(et);
 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
 		ctl = nhg_priv->nh_control;
 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
 			/* Do not try to reclaim */
 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
 			    nhg_priv);
 			NET_EPOCH_EXIT(et);
 			return;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* The group is unlinked by now; defer the actual destruction. */
 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
 }
 
 /*
  * Destroys all local resources belonging to @nhg_priv.
  *
  * The group is a single allocation starting at nhg_priv->nhg (see
  * alloc_nhgrp()), so freeing it releases @nhg_priv as well.
  * Nexthop references are NOT dropped here.
  */
 __noinline static void
 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
 {
 
 	free(nhg_priv->nhg, M_NHOP);
 }
 
 /*
  * Final destructor for a group that has no references left and has
  * been unlinked (both asserted): drops the references held on the
  * member nexthops and frees the group memory.
  */
 __noinline static void
 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
 {
 
 	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
 
 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
 		/* __unused: only referenced from within the log macro */
 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
 		    nhgbuf, sizeof(nhgbuf)));
 	}
 
 	free_nhgrp_nhops(nhg_priv);
 	destroy_nhgrp_int(nhg_priv);
 }
 
 /*
  * Deferred epoch callback scheduled by nhgrp_free(): by the time it
  * runs the group is safe to destroy.
  * @ctx is the nhg_epoch_ctx member embedded in the group's nhgrp_priv.
  */
 static void
 destroy_nhgrp_epoch(epoch_context_t ctx)
 {
 
 	/* Recover the enclosing nhgrp_priv from its embedded context. */
 	destroy_nhgrp(__containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx));
 }
 
 /*
  * Take a reference on every nexthop of the group @nhg_priv.
  *
  * All-or-nothing: if any nexthop cannot be referenced, the references
  * taken so far are released again.
  *
  * Returns true if all nexthops have been referenced, false otherwise.
  */
 static bool
 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
 {
 
 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) == 0) {
 			/*
 			 * The nexthop is being deleted and cannot be
 			 * referenced: undo the references already taken.
 			 */
 			for (int undo = 0; undo < i; undo++)
 				nhop_free(nhg_priv->nhg_nh_weights[undo].nh);
 			return (false);
 		}
 	}
 
 	return (true);
 }
 
 /*
  * Release the reference held on every nexthop of the group @nhg_priv.
  */
 static void
 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
 {
 	int idx = 0;
 
 	while (idx < nhg_priv->nhg_nh_count) {
 		nhop_free(nhg_priv->nhg_nh_weights[idx].nh);
 		idx++;
 	}
 }
 
 /*
  * Allocate a nexthop group made of the @num_nhops nexthops in @wn.
  *
  * @wn is sorted in place by nexthop index.  The nexthops have to be
  * unique and belong to the nexthop control structure of the
  * (@fibnum, @family) routing table.
  *
  * Returns an unlinked nhgrp object on success, setting @perror to 0.
  * On failure NULL is returned and @perror is one of:
  *  E2BIG   no such routing table, or more than RIB_MAX_MPATH_WIDTH nexthops
  *  ENOMEM  multipath bootstrap or group allocation failed
  *  EINVAL  a nexthop belongs to a different nexthop control structure
  *  EEXIST  the same nexthop is listed twice
  */
 struct nhgrp_object *
 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
     int *perror)
 {
 	struct rib_head *rh;
 	struct nh_control *ctl;
 	struct nhgrp_priv *nhg_priv;
 	uint32_t prev_idx;
 
 	rh = rt_tables_get_rnh(fibnum, family);
 	if (rh == NULL) {
 		*perror = E2BIG;
 		return (NULL);
 	}
 	ctl = rh->nh_control;
 
 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
 		*perror = E2BIG;
 		return (NULL);
 	}
 
 	/* First multipath request: bootstrap the mpath datastructures. */
 	if (ctl->gr_head.hash_size == 0 &&
 	    nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
 		*perror = ENOMEM;
 		return (NULL);
 	}
 
 	/*
 	 * Once sorted by index, duplicates end up next to each other, so
 	 * comparing against the previous index is enough to detect them.
 	 */
 	sort_weightened_nhops(wn, num_nhops);
 	prev_idx = 0;
 	for (int i = 0; i < num_nhops; i++) {
 		if (wn[i].nh->nh_priv->nh_control != ctl) {
 			*perror = EINVAL;
 			return (NULL);
 		}
 		if (wn[i].nh->nh_priv->nh_idx == prev_idx) {
 			*perror = EEXIST;
 			return (NULL);
 		}
 		prev_idx = wn[i].nh->nh_priv->nh_idx;
 	}
 
 	nhg_priv = alloc_nhgrp(wn, num_nhops);
 	if (nhg_priv == NULL) {
 		*perror = ENOMEM;
 		return (NULL);
 	}
 	nhg_priv->nh_control = ctl;
 
 	*perror = 0;
 	return (nhg_priv->nhg);
 }
 
 /*
  * Finds an existing group matching @nhg or links @nhg to the tree.
  *
  * @nhg must be a freshly allocated, not yet linked group.  It is
  * consumed unless it is the group being returned.
  *
  * Returns the referenced group or NULL and non-zero @perror (EAGAIN).
  */
 struct nhgrp_object *
 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
 {
 	struct nhgrp_priv *key = NHGRP_PRIV(nhg);
 	struct nh_control *ctl = key->nh_control;
 	struct nhgrp_priv *existing;
 
 	existing = find_nhgrp(ctl, key);
 	if (existing != NULL) {
 		/*
 		 * An equal group is linked already.  The new one has not
 		 *  been linked and holds no nexthop references, so its
 		 *  memory can simply be released.
 		 */
 		destroy_nhgrp_int(key);
 		*perror = 0;
 		return (existing->nhg);
 	}
 
 	/* No existing group, try to link the new one */
 	if (!ref_nhgrp_nhops(key)) {
 		/*
 		 * Some of the nexthops have been scheduled for deletion.
 		 * The group is not linked and holds no references, so the
 		 *  final destructor can be called immediately.
 		 */
 		destroy_nhgrp_int(key);
 		*perror = EAGAIN;
 		return (NULL);
 	}
 
 	if (link_nhgrp(ctl, key) == 0) {
 		/* Unable to allocate index? */
 		*perror = EAGAIN;
 		free_nhgrp_nhops(key);
 		destroy_nhgrp_int(key);
 		return (NULL);
 	}
 
 	*perror = 0;
 	return (nhg);
 }
 
 /*
  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
  *
  * A candidate group is allocated for the fib/family served by @ctl,
  * tagged with @uidx and then either linked or replaced by an equal
  * group that is linked already.
  *
  * Returns referenced nhop group or NULL, passing error code in @perror.
  */
 struct nhgrp_priv *
 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
     uint32_t uidx, int *perror)
 {
 	struct nhgrp_object *candidate, *linked;
 
 	candidate = nhgrp_alloc(ctl->ctl_rh->rib_fibnum,
 	    ctl->ctl_rh->rib_family, wn, num_nhops, perror);
 	if (candidate == NULL)
 		return (NULL);
 	nhgrp_set_uidx(candidate, uidx);
 
 	linked = nhgrp_get_nhgrp(candidate, perror);
 
 	return ((linked != NULL) ? NHGRP_PRIV(linked) : NULL);
 }
 
 
 /*
  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
  *
  * Returns referenced nexthop group or NULL. In the latter case, @perror is
  *  filled with an error code.
  * Note that function does NOT care if the next nexthops already exists
  * in the @gr_orig. As a result, they will be added, resulting in the
  * same nexthop being present multiple times in the new group.
  */
 static struct nhgrp_priv *
 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
     struct weightened_nhop *wn, int num_nhops, int *perror)
 {
 	char storage[64];
 	struct weightened_nhop *pnhops;
 	struct nhgrp_priv *nhg_priv;
 	const struct nhgrp_priv *src_priv;
 	size_t sz;
 	int curr_nhops;
 
 	src_priv = NHGRP_PRIV_CONST(gr_orig);
 	curr_nhops = src_priv->nhg_nh_count;
 
 	*perror = 0;
 
 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
 	/* optimize for <= 4 paths, each path=16 bytes */
 	if (sz <= sizeof(storage))
 		pnhops = (struct weightened_nhop *)&storage[0];
 	else {
 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
 		if (pnhops == NULL) {
 			*perror = ENOMEM;
 			return (NULL);
 		}
 	}
 
 	/* Copy nhops from original group first */
 	memcpy(pnhops, src_priv->nhg_nh_weights,
 	  curr_nhops * sizeof(struct weightened_nhop));
 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
 	curr_nhops += num_nhops;
 
 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
 
 	if (pnhops != (struct weightened_nhop *)&storage[0])
 		free(pnhops, M_TEMP);
 
 	if (nhg_priv == NULL)
 		return (NULL);
 
 	return (nhg_priv);
 }
 
 
 /*
  * Creates/finds nexthop group based on @wn and @num_nhops.
  * Returns 0 on success with referenced group in @pnhg, or
  * errno.  @pnhg is left untouched on failure.
  *
  * @uidx is passed to get_nhgrp(), which sets it on the candidate group
  * via nhgrp_set_uidx().
  *
  * If the error is EAGAIN, then the operation can be retried.
  */
 int
 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
     uint32_t uidx, struct nhgrp_object **pnhg)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct nhgrp_priv *nhg_priv;
 	int error;
 
 	/* get_nhgrp() always fills in @error, on success and on failure. */
 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
 	if (nhg_priv != NULL)
 		*pnhg = nhg_priv->nhg;
 
 	return (error);
 }
 
 /*
  * Creates new nexthop group based on @src group without the nexthops
  * chosen by @flt_func.
  *
  * @flt_func is called with @rt, each nexthop of @src and @flt_data; a
  * non-zero return drops that nexthop from the result.
  *
  * Returns 0 on success, storing the result in @rnd:
  *  - no nexthop left:    rnd_nhgrp is NULL, rnd_weight is 0
  *  - one nexthop left:   referenced nexthop in rnd_nhop with its weight
  *  - several left:       referenced group in rnd_nhgrp, rnd_weight is 0
  * Returns ENOMEM, EAGAIN or the error reported by get_nhgrp() on failure.
  */
 int
 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
     struct route_nhop_data *rnd)
 {
 	/*
 	 * On-stack scratch space for the common case of <= 4 paths.
 	 * Declared with the element type (rather than as a char array)
 	 * so that it is properly aligned for struct weightened_nhop.
 	 */
 	struct weightened_nhop storage[4];
 	struct nh_control *ctl = rh->nh_control;
 	struct weightened_nhop *pnhops;
 	const struct nhgrp_priv *mp_priv, *src_priv;
 	size_t sz;
 	int error, i, num_nhops;
 
 	src_priv = NHGRP_PRIV_CONST(src);
 
 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
 	if (sz <= sizeof(storage))
 		pnhops = storage;
 	else {
 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
 			return (ENOMEM);
 	}
 
 	/* Filter nexthops */
 	error = 0;
 	num_nhops = 0;
 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
 			continue;
 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
 		  sizeof(struct weightened_nhop));
 	}
 
 	if (num_nhops == 0) {
 		rnd->rnd_nhgrp = NULL;
 		rnd->rnd_weight = 0;
 	} else if (num_nhops == 1) {
 		/* A single path does not need a group: use the nexthop. */
 		rnd->rnd_nhop = pnhops[0].nh;
 		rnd->rnd_weight = pnhops[0].weight;
 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
 			error = EAGAIN;
 	} else {
 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
 		if (mp_priv != NULL)
 			rnd->rnd_nhgrp = mp_priv->nhg;
 		rnd->rnd_weight = 0;
 	}
 
 	if (pnhops != storage)
 		free(pnhops, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Creates new multipath group based on existing group/nhop in @rnd_orig and
  *  to-be-added nhop @wn_add.
  * Returns 0 on success and stores result in @rnd_new.
  */
 int
 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct nhgrp_priv *nhg_priv;
 	struct weightened_nhop wn[2] = {};
 	int error;
 
 	if (rnd_orig->rnd_nhop == NULL) {
 		/* No paths to add to, just reference current nhop */
 		*rnd_new = *rnd_add;
 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
 			return (EAGAIN);
 		return (0);
 	}
 
 	wn[0].nh = rnd_add->rnd_nhop;
 	wn[0].weight = rnd_add->rnd_weight;
 
 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
 		/* Simple merge of 2 non-multipath nexthops */
 		wn[1].nh = rnd_orig->rnd_nhop;
 		wn[1].weight = rnd_orig->rnd_weight;
 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
 	} else {
 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
 		    &error);
 	}
 
 	if (nhg_priv == NULL)
 		return (error);
 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
 	rnd_new->rnd_weight = 0;
 
 	return (0);
 }
 
 /*
  * Gives access to the weighted nexthop array of @nhg.
  * The number of array elements is written to @pnum_nhops.
  */
 const struct weightened_nhop *
 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
 {
 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
 
 	const struct nhgrp_priv *priv = NHGRP_PRIV_CONST(nhg);
 
 	*pnum_nhops = priv->nhg_nh_count;
 	return (priv->nhg_nh_weights);
 }
 
 /*
  * Stores @uidx in the private part of the nexthop group @nhg.
  */
 void
 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
 {
 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
 
 	struct nhgrp_priv *priv = NHGRP_PRIV(nhg);
 
 	priv->nhg_uidx = uidx;
 }
 
 /*
  * Returns the uidx value stored in the private part of @nhg.
  */
 uint32_t
 nhgrp_get_uidx(const struct nhgrp_object *nhg)
 {
 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
 
 	const struct nhgrp_priv *priv = NHGRP_PRIV_CONST(nhg);
 
 	return (priv->nhg_uidx);
 }
 
 /*
  * Prints nexthop group @nhg data in the provided @buf of @bufsize bytes.
  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
  *
  * Entries that do not fit are replaced by an ellipsis.
  * If @buf is too small to hold even the "nhg#N/sz=M:[" header, only the
  * (truncated) header is produced.
  *
  * Returns @buf.
  */
 char *
 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
 {
 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
 
 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
 	    nhg_priv->nhg_nh_count);
 	/*
 	 * snprintf() reports the length the output would have had without
 	 * truncation. If the header did not fit, @off is at or past the end
 	 * of the buffer: "bufsize - off" would wrap around and &buf[off]
 	 * would be out of bounds, so stop here.
 	 */
 	if (off < 0 || (size_t)off >= bufsize)
 		return (buf);
 
 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
 		    wn->nh->nh_priv->nh_idx, wn->weight);
 		if ((size_t)len + (size_t)off + 3 >= bufsize) {
 			/* No room for this entry: overwrite it with an ellipsis */
 			off += snprintf(&buf[off], bufsize - off, "...");
 			break;
 		}
 		off += len;
 	}
 	/*
 	 * Step back over the last character written: the trailing ","
 	 * of the final entry or the last "." of the ellipsis.
 	 */
 	if (off > 0)
 		off--;
 	if ((size_t)off + 1 < bufsize)
 		snprintf(&buf[off], bufsize - off, "]");
 	return (buf);
 }
 
 /*
  * Serializes a single nexthop group @nhg_priv into @buffer and passes it
  * to the sysctl request @w.
  *
  * The message is laid out as:
  *   struct rt_msghdr
  *   struct nhgrp_external
  *   struct nhgrp_container (NHG_C_TYPE_CNHOPS) + nhg_nh_count entries
  *   struct nhgrp_container (NHG_C_TYPE_DNHOPS) + nhg_size entries
  *
  * @buffer_size is only verified with KASSERT(); the caller has to supply
  * a buffer large enough for the biggest possible group.
  *
  * Returns the result of SYSCTL_OUT().
  */
 __noinline static int
 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
     char *buffer, size_t buffer_size, struct sysctl_req *w)
 {
 	struct rt_msghdr *rtm;
 	struct nhgrp_external *nhge;
 	struct nhgrp_container *nhgc;
 	const struct nhgrp_object *nhg;
 	struct nhgrp_nhop_external *ext;
 	int error;
 	size_t sz;
 
 	nhg = nhg_priv->nhg;
 
 	/* Compute the total message size for this particular group */
 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
 	/* controlplane nexthops */
 	sz += sizeof(struct nhgrp_container);
 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
 	/* dataplane nexthops */
 	sz += sizeof(struct nhgrp_container);
 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
 
 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
 
 	/* Start from a zeroed message so unset fields are not leaked */
 	bzero(buffer, sz);
 
 	rtm = (struct rt_msghdr *)buffer;
 	rtm->rtm_msglen = sz;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = RTM_GET;
 
 	/* The group description immediately follows the message header */
 	nhge = (struct nhgrp_external *)(rtm + 1);
 
 	nhge->nhg_idx = nhg_priv->nhg_idx;
 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
 
 	/* fill in control plane nexthops first */
 	nhgc = (struct nhgrp_container *)(nhge + 1);
 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
 	nhgc->nhgc_subtype = 0;
 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
 
 	/* Container entries start right after the container header */
 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
 	}
 
 	/* fill in dataplane nexthops */
 	/* The second container begins where the first container's entries end */
 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
 	nhgc->nhgc_subtype = 0;
 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
 	nhgc->nhgc_count = nhg->nhg_size;
 
 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
 	for (int i = 0; i < nhg->nhg_size; i++) {
 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
 		/* Dataplane slots are exported with a zero weight */
 		ext[i].nh_weight = 0;
 	}
 
 	error = SYSCTL_OUT(w, buffer, sz);
 
 	return (error);
 }
 
 /*
  * Returns the index stored in the private part of @nhg.
  */
 uint32_t
 nhgrp_get_idx(const struct nhgrp_object *nhg)
 {
 	const struct nhgrp_priv *priv = NHGRP_PRIV_CONST(nhg);
 
 	return (priv->nhg_idx);
 }
 
 uint8_t
 nhgrp_get_origin(const struct nhgrp_object *nhg)
 {
 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
 }
 
 /*
  * Stores @origin in the private part of @nhg.
  */
 void
 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
 {
 	struct nhgrp_priv *priv = NHGRP_PRIV(nhg);
 
 	priv->nhg_origin = origin;
 }
 
 /*
  * Returns the number of items in the nexthop group table of @rh,
  * read under the nexthop read lock.
  */
 uint32_t
 nhgrp_get_count(struct rib_head *rh)
 {
 	struct nh_control *ctl = rh->nh_control;
 	uint32_t items;
 
 	NHOPS_RLOCK(ctl);
 	items = ctl->gr_head.items_count;
 	NHOPS_RUNLOCK(ctl);
 
 	return (items);
 }
 
 /*
  * Dumps every nexthop group of @rh into the sysctl request @w.
  *
  * Returns 0 on success (including the case of no groups), ENOMEM if
  * the temporary buffer cannot be allocated, or the first error
  * returned by dump_nhgrp_entry().
  */
 int
 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct epoch_tracker et;
 	struct nhgrp_priv *nhg_priv;
 	char *buffer;
 	size_t sz;
 	int error = 0;
 
 	if (ctl->gr_head.items_count == 0)
 		return (0);
 
 	/*
 	 * One buffer sized for the largest possible group is reused for
 	 * every entry: headers, two containers and RIB_MAX_MPATH_WIDTH
 	 * entries in each of them.
 	 */
 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external) +
 	    2 * sizeof(struct nhgrp_container) +
 	    2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
 	if (buffer == NULL)
 		return (ENOMEM);
 
 	NET_EPOCH_ENTER(et);
 	NHOPS_RLOCK(ctl);
 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
 		/* Stop at the first entry that fails to be written out */
 		if ((error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w)) != 0)
 			break;
 	} CHT_SLIST_FOREACH_END;
 	NHOPS_RUNLOCK(ctl);
 	NET_EPOCH_EXIT(et);
 
 	free(buffer, M_TEMP);
 
 	return (error);
 }
diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c
index d042d9519f6b..15d4ec394187 100644
--- a/sys/net/route/nhop_ctl.c
+++ b/sys/net/route/nhop_ctl.c
@@ -1,1217 +1,1218 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/epoch.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <net/vnet.h>
 
 #define	DEBUG_MOD_NAME	nhop_ctl
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
 #include <net/route/route_debug.h>
 _DECLARE_DEBUG(LOG_INFO);
 
 /*
  * This file contains core functionality for the nexthop ("nhop") route subsystem.
  * The business logic needed to create nexthop objects is implemented here.
  *
  * Nexthops in the original sense are the objects containing all the necessary
  * information to forward the packet to the selected destination.
  * In particular, nexthop is defined by a combination of
  *  ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_upper_family, mask of rt_flags and
  *    NHF_DEFAULT
  *
  * Additionally, each nexthop gets assigned its unique index (nexthop index).
  * It serves two purposes: the first one is to ease the ability of userland programs to
  *  reference nexthops by their index. The second one allows lookup algorithms
  *  to store an index instead of a pointer (2 bytes vs 8) as a lookup result.
  * All nexthops are stored in the resizable hash table.
  *
  * Basically, this file revolves around supporting the following functions:
  * 1) nhop_create_from_info / nhop_create_from_nhop, which contain all
  *  business logic on filling the nexthop fields based on the provided request.
  * 2) nhop_get_nhop(), which gets a usable referenced nexthop.
  *
  * Conventions:
  * 1) non-exported functions start with a verb
  * 2) exported functions start with the subsystem prefix: "nhop"
  */
 
 /* Forward declarations of the file-local helpers */
 static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w);
 
 static int finalize_nhop(struct nh_control *ctl, struct nhop_object *nh, bool link);
 static struct ifnet *get_aifp(const struct nhop_object *nh);
 static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp);
 
 static void destroy_nhop_epoch(epoch_context_t ctx);
 static void destroy_nhop(struct nhop_object *nh);
 
 /* Compile-time checks on the nhop_object layout: nh_ifp offset and total size */
 _Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32,
     "nhop_object: wrong nh_ifp offset");
 _Static_assert(sizeof(struct nhop_object) <= 128,
     "nhop_object: size exceeds 128 bytes");
 
 static uma_zone_t nhops_zone;	/* Global zone for each and every nexthop */
 
 /*
  * Sizes of the public and private nexthop parts, each rounded up to a
  * multiple of two cache lines. A zone item holds both parts back to back.
  */
 #define	NHOP_OBJECT_ALIGNED_SIZE	roundup2(sizeof(struct nhop_object), \
 							2 * CACHE_LINE_SIZE)
 #define	NHOP_PRIV_ALIGNED_SIZE		roundup2(sizeof(struct nhop_priv), \
 							2 * CACHE_LINE_SIZE)
 void
 nhops_init(void)
 {
 
 	nhops_zone = uma_zcreate("routing nhops",
 	    NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * Determines the "address" interface of the nexthop @nh.
  *
  * Normally this is the transmit interface (nh_ifp). The exception is a
  * nexthop that transmits via a loopback interface and has an AF_LINK
  * gateway: there the interface referenced by the gateway's sdl_index is
  * looked up and preferred. This is needed to support link-local IPv6
  * loopback communications.
  *
  * Returns the found ifp; falls back to nh_ifp if the lookup fails.
  */
 static struct ifnet *
 get_aifp(const struct nhop_object *nh)
 {
 	struct ifnet *aifp;
 
 	/* The common case: the packet is not looped back to ourselves */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0 ||
 	    nh->gw_sa.sa_family != AF_LINK)
 		return (nh->nh_ifp);
 
 	/*
 	 * The ifp is the loopback interface, but we'd rather know the
 	 * interface associated with the destination address (which should
 	 * probably be one of our own addresses).
 	 */
 	aifp = ifnet_byindex(nh->gwl_sa.sdl_index);
 	if (aifp != NULL)
 		return (aifp);
 
 	FIB_NH_LOG(LOG_WARNING, nh, "unable to get aifp for %s index %d",
 		if_name(nh->nh_ifp), nh->gwl_sa.sdl_index);
 
 	return (nh->nh_ifp);
 }
 
 /*
  * Compares two nexthops by their comparable prefixes: the first
  * NHOP_END_CMP bytes of the nhop_object and the first NH_PRIV_END_CMP
  * bytes of the nhop_priv.
  *
  * Returns 1 if both parts are equal, 0 otherwise.
  */
 int
 cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
 {
 
 	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
 	    memcmp(_one, _two, NH_PRIV_END_CMP) == 0);
 }
 
 /*
  * Applies the MTU from @info to @nh, but only if the request
  * carries one (RTV_MTU set in rti_mflags).
  */
 static void
 set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
 {
 	if ((info->rti_mflags & RTV_MTU) == 0)
 		return;
 
 	nhop_set_mtu(nh, info->rti_rmx->rmx_mtu, true);
 }
 
 /*
  * Initializes @sdl as a short AF_LINK sockaddr describing @ifp
  * (interface index and type), suitable to be stored inside the
  * nexthop gateway buffer.
  */
 static void
 fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
 {
 
 	/* Everything that is not set explicitly below stays zero */
 	bzero(sdl, sizeof(struct sockaddr_dl_short));
 	sdl->sdl_len = sizeof(struct sockaddr_dl_short);
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_type = ifp->if_type;
 	sdl->sdl_index = ifp->if_index;
 }
 
 /*
  * Sets the gateway of @nh from the RTAX_GATEWAY address in @info.
  * The gateway address must be present.
  *
  * Returns 0 on success or EINVAL if the gateway data is not usable.
  */
 static int
 set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
 {
 	struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
 	struct sockaddr_dl *sdl;
 	struct ifnet *ifp;
 	bool is_gw;
 
 	MPASS(gw != NULL);
 	is_gw = (info->rti_flags & RTF_GATEWAY) != 0;
 
 	if (gw->sa_family != AF_LINK || is_gw) {
 		/*
 		 * Multiple options here:
 		 *
 		 * 1) RTF_GATEWAY with IPv4/IPv6 gateway data
 		 * 2) Interface route with IPv4/IPv6 address of the
 		 *   matching interface. Some routing daemons do that
 		 *   instead of specifying ifindex in AF_LINK.
 		 *
 		 * In both cases, save the original nexthop to make the
 		 *   callers happy.
 		 */
 		return (nhop_set_gw(nh, gw, is_gw) ? 0 : EINVAL);
 	}
 
 	/*
 	 * Interface route with the interface specified by the interface
 	 * index in the sockaddr_dl structure. It is used in the IPv6
 	 * loopback output code, where we need to preserve the original
 	 * interface to maintain proper scoping.
 	 * Despite the fact that nexthop code stores the original interface
 	 * in a separate field (nh_aifp), write an AF_LINK compatible sa
 	 * with shorter total length.
 	 */
 	sdl = (struct sockaddr_dl *)gw;
 	ifp = ifnet_byindex(sdl->sdl_index);
 	if (ifp == NULL) {
 		FIB_NH_LOG(LOG_DEBUG, nh, "error: invalid ifindex %d",
 		    sdl->sdl_index);
 		return (EINVAL);
 	}
 	nhop_set_direct_gw(nh, ifp);
 
 	return (0);
 }
 
 /*
  * Sets the expiration time of @nh from @info.
  *
  * If the request carries a positive rmx_expire (RTV_EXPIRE set), it is
  * rebased from the time_second timebase to the time_uptime one.
  * Otherwise the expiration time is set to 0.
  */
 static void
 set_nhop_expire_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
 {
 	if ((info->rti_mflags & RTV_EXPIRE) == 0 ||
 	    !(info->rti_rmx->rmx_expire > 0)) {
 		nhop_set_expire(nh, 0);
 		return;
 	}
 
 	uint32_t nh_expire = info->rti_rmx->rmx_expire - time_second + time_uptime;
 
 	nhop_set_expire(nh, nh_expire);
 }
 
 /*
  * Creates a new nexthop based on the information in @info.
  *
  * Returns:
  * 0 on success, filling @nh_ret with the desired nexthop object ptr
  * errno otherwise
  */
 int
 nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_object **nh_ret)
 {
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	MPASS(info->rti_ifa != NULL);
 	MPASS(info->rti_ifp != NULL);
 
 	if (info->rti_info[RTAX_GATEWAY] == NULL) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: empty gateway");
 		return (EINVAL);
 	}
 
 	struct nhop_object *nh = nhop_alloc(rnh->rib_fibnum, rnh->rib_family);
 	if (nh == NULL)
 		return (ENOMEM);
 
 	if ((error = set_nhop_gw_from_info(nh, info)) != 0) {
 		nhop_free(nh);
 		return (error);
 	}
 	nhop_set_transmit_ifp(nh, info->rti_ifp);
 
 	nhop_set_blackhole(nh, info->rti_flags & (RTF_BLACKHOLE | RTF_REJECT));
 
 	error = rnh->rnh_set_nh_pfxflags(rnh->rib_fibnum, info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], nh);
 
 	nhop_set_redirect(nh, info->rti_flags & RTF_DYNAMIC);
 	nhop_set_pinned(nh, info->rti_flags & RTF_PINNED);
 	set_nhop_expire_from_info(nh, info);
 	nhop_set_rtflags(nh, info->rti_flags);
 
 	set_nhop_mtu_from_info(nh, info);
 	nhop_set_src(nh, info->rti_ifa);
 
 	/*
 	 * The remaining fields are either set from nh_preadd hook
 	 * or are computed from the provided data
 	 */
 	*nh_ret = nhop_get_nhop(nh, &error);
 
 	return (error);
 }
 
 /*
  * Gets a linked nhop using the provided @nh nexthop data.
  * The rib head is derived from @nh itself (see nhop_get_rh()); the
  * rest of the work is done by nhop_get_nhop_internal().
  *
  * Returns the nexthop and stores 0 in @perror on success.
  * On failure NULL is returned and @perror holds the errno. If no rib
  * head matches, @nh is freed and the error is EAFNOSUPPORT.
  */
 struct nhop_object *
 nhop_get_nhop(struct nhop_object *nh, int *perror)
 {
 	struct rib_head *rnh = nhop_get_rh(nh);
 
 	if (__predict_true(rnh != NULL))
 		return (nhop_get_nhop_internal(rnh, nh, perror));
 
 	*perror = EAFNOSUPPORT;
 	nhop_free(nh);
 	return (NULL);
 }
 
 /*
  * Gets a linked nhop for @rnh using the provided @nh nexthop data.
  * If find_nhop() returns an existing nexthop, that one is returned and
  * the provided @nh is freed. Otherwise @nh itself is finalized and
  * linked.
  *
  * Returns the nexthop with @perror set to 0, or NULL with @perror
  * holding the errno.
  */
 struct nhop_object *
 nhop_get_nhop_internal(struct rib_head *rnh, struct nhop_object *nh, int *perror)
 {
 	struct nhop_priv *existing;
 	int rc;
 
 	nh->nh_aifp = get_aifp(nh);
 
 	/* Give the protocols chance to augment nexthop properties */
 	rc = rnh->rnh_augment_nh(rnh->rib_fibnum, nh);
 	if (rc != 0) {
 		nhop_free(nh);
 		*perror = rc;
 		return (NULL);
 	}
 
 	existing = find_nhop(rnh->nh_control, nh->nh_priv);
 	if (existing != NULL) {
 		/* The template is not needed anymore */
 		nhop_free(nh);
 		*perror = 0;
 		return (existing->nh);
 	}
 
 	/*
 	 * Existing nexthop not found, need to create a new one.
 	 * Note: multiple simultaneous requests
 	 *  can result in multiple equal nexthops existing in the
 	 *  nexthop table. This is not a problem until the
 	 *  relative number of such nexthops is significant, which
 	 *  is extremely unlikely.
 	 */
 	*perror = finalize_nhop(rnh->nh_control, nh, true);
 	if (*perror != 0)
 		return (NULL);
 
 	return (nh);
 }
 
 /*
  * Gets a referenced but unlinked nhop.
  * Allocates/references the remaining bits of the nexthop data, so
  *  it can be safely linked later or used as a clone source.
  *
  * Returns 0 on success. If no rib head matches @nh, the nexthop is
  * freed and EAFNOSUPPORT is returned; otherwise the result of
  * finalize_nhop() is returned.
  */
 int
 nhop_get_unlinked(struct nhop_object *nh)
 {
 	struct rib_head *rnh = nhop_get_rh(nh);
 
 	if (__predict_true(rnh != NULL)) {
 		nh->nh_aifp = get_aifp(nh);
 		return (finalize_nhop(rnh->nh_control, nh, false));
 	}
 
 	nhop_free(nh);
 	return (EAFNOSUPPORT);
 }
 
 
 /*
  * Update @nh with data supplied in @info.
  * This is a helper function to support route changes.
  *
  * It limits the changes that can be done to the route to the following:
  * 1) all combination of gateway changes
  * 2) route flags (FLAG[123],STATIC)
  * 3) route MTU
  *
  * Returns:
  * 0 on success, errno otherwise
  */
 static int
 alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
 {
 	struct sockaddr *info_gw;
 	int error;
 
 	/* Update MTU if set in the request*/
 	set_nhop_mtu_from_info(nh, info);
 
 	/* Only RTF_FLAG[123] and RTF_STATIC */
 	uint32_t rt_flags = nhop_get_rtflags(nh) & ~RT_CHANGE_RTFLAGS_MASK;
 	rt_flags |= info->rti_flags & RT_CHANGE_RTFLAGS_MASK;
 	nhop_set_rtflags(nh, rt_flags);
 
 	/* Consider gateway change */
 	info_gw = info->rti_info[RTAX_GATEWAY];
 	if (info_gw != NULL) {
 		error = set_nhop_gw_from_info(nh, info);
 		if (error != 0)
 			return (error);
 	}
 
 	if (info->rti_ifa != NULL)
 		nhop_set_src(nh, info->rti_ifa);
 	if (info->rti_ifp != NULL)
 		nhop_set_transmit_ifp(nh, info->rti_ifp);
 
 	return (0);
 }
 
 /*
  * Creates new nexthop based on @nh_orig and augmentation data from @info.
  * Helper function used in the route changes, please see
  *   alter_nhop_from_info() comments for more details.
  *
  * Returns:
  * 0 on success, filling @nh_ret with the desired nexthop object
  * errno otherwise
  */
 int
 nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
     struct rt_addrinfo *info, struct nhop_object **pnh)
 {
 	struct nhop_object *nh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	nh = nhop_alloc(rnh->rib_fibnum, rnh->rib_family);
 	if (nh == NULL)
 		return (ENOMEM);
 
 	nhop_copy(nh, nh_orig);
 
 	error = alter_nhop_from_info(nh, info);
 	if (error != 0) {
 		nhop_free(nh);
 		return (error);
 	}
 
 	*pnh = nhop_get_nhop(nh, &error);
 
 	return (error);
 }
 
 /*
  * Takes references on the objects @nh points to: the source address
  * (nh_ifa), the address interface (nh_aifp, recomputed here) and the
  * transmit interface (nh_ifp).
  *
  * Returns true if all references were obtained. On failure the
  * references taken so far are dropped and false is returned.
  */
 static bool
 reference_nhop_deps(struct nhop_object *nh)
 {
 	if (!ifa_try_ref(nh->nh_ifa))
 		return (false);
 
 	nh->nh_aifp = get_aifp(nh);
 	if (if_try_ref(nh->nh_aifp)) {
 		FIB_NH_LOG(LOG_DEBUG2, nh, "nh_aifp: %s nh_ifp %s",
 		    if_name(nh->nh_aifp), if_name(nh->nh_ifp));
 		if (if_try_ref(nh->nh_ifp))
 			return (true);
 
 		/* nh_ifp is going away: undo both earlier references */
 		ifa_free(nh->nh_ifa);
 		if_rele(nh->nh_aifp);
 		return (false);
 	}
 
 	/* nh_aifp is going away: only the ifa reference has to be undone */
 	ifa_free(nh->nh_ifa);
 	return (false);
 }
 
 /*
  * Allocates/references the remaining bits of nexthop data and, if @link
  *  is set, links the nexthop to the hash table of @ctl. With @link unset
  *  the nexthop is only marked as finalized.
  *
  * Returns 0 if successful, errno otherwise:
  *  ENOMEM  - the packet counter cannot be allocated
  *  EAGAIN  - the ifa/interface references cannot be obtained
  *  ENOBUFS - linking to the hash table failed
  * @nh is freed in case of error.
  */
 static int
 finalize_nhop(struct nh_control *ctl, struct nhop_object *nh, bool link)
 {
 
 	/* Allocate per-cpu packet counter */
 	nh->nh_pksent = counter_u64_alloc(M_NOWAIT);
 	if (nh->nh_pksent == NULL) {
 		/*
 		 * Log before freeing: @nh is not finalized yet, so
 		 * nhop_free() releases its memory right away.
 		 */
 		FIB_NH_LOG(LOG_WARNING, nh, "counter_u64_alloc() failed");
 		RTSTAT_INC(rts_nh_alloc_failure);
 		nhop_free(nh);
 		return (ENOMEM);
 	}
 
 	if (!reference_nhop_deps(nh)) {
 		/* Same as above: @nh must not be touched after nhop_free() */
 		FIB_NH_LOG(LOG_WARNING, nh, "interface reference failed");
 		RTSTAT_INC(rts_nh_alloc_failure);
 		counter_u64_free(nh->nh_pksent);
 		nhop_free(nh);
 		return (EAGAIN);
 	}
 
 	/* Save vnet to ease destruction */
 	nh->nh_priv->nh_vnet = curvnet;
 
 	/* Please see nhop_free() comments on the initial value */
 	refcount_init(&nh->nh_priv->nh_linked, 2);
 
 	MPASS(nh->nh_priv->nh_fibnum == ctl->ctl_rh->rib_fibnum);
 
 	if (!link) {
 		/* Unlinked nexthop: drop nh_linked to 1 so unlink is skipped */
 		refcount_release(&nh->nh_priv->nh_linked);
 		NHOPS_WLOCK(ctl);
 		nh->nh_priv->nh_finalized = 1;
 		NHOPS_WUNLOCK(ctl);
 	} else if (link_nhop(ctl, nh->nh_priv) == 0) {
 		/*
 		 * Adding nexthop to the datastructures
 		 *  failed. Call destructor w/o waiting for
 		 *  the epoch end, as nexthop is not used
 		 *  and return.
 		 */
 		char nhbuf[NHOP_PRINT_BUFSIZE];
 		FIB_NH_LOG(LOG_WARNING, nh, "failed to link %s",
 		    nhop_print_buf(nh, nhbuf, sizeof(nhbuf)));
 		destroy_nhop(nh);
 
 		return (ENOBUFS);
 	}
 
 	IF_DEBUG_LEVEL(LOG_DEBUG) {
 		char nhbuf[NHOP_PRINT_BUFSIZE] __unused;
 		FIB_NH_LOG(LOG_DEBUG, nh, "finalized: %s",
 		    nhop_print_buf(nh, nhbuf, sizeof(nhbuf)));
 	}
 
 	return (0);
 }
 
 /*
  * Releases everything a finalized nexthop holds (interface and ifa
  * references, packet counter) and returns @nh to the nexthop zone.
  */
 static void
 destroy_nhop(struct nhop_object *nh)
 {
 	/* References taken in reference_nhop_deps() */
 	if_rele(nh->nh_ifp);
 	if_rele(nh->nh_aifp);
 	ifa_free(nh->nh_ifa);
 
 	/* Per-cpu packet counter allocated in finalize_nhop() */
 	counter_u64_free(nh->nh_pksent);
 
 	uma_zfree(nhops_zone, nh);
 }
 
 /*
  * Epoch callback indicating the nhop is safe to destroy.
  * @ctx is the nh_epoch_ctx member embedded in the nexthop's nhop_priv.
  */
 static void
 destroy_nhop_epoch(epoch_context_t ctx)
 {
 	struct nhop_priv *priv =
 	    __containerof(ctx, struct nhop_priv, nh_epoch_ctx);
 
 	destroy_nhop(priv->nh);
 }
 
 /*
  * Takes an additional reference on @nh. The caller must already hold
  * one: acquiring from a zero refcount trips the assertion.
  */
 void
 nhop_ref_object(struct nhop_object *nh)
 {
 	u_int old __diagused = refcount_acquire(&nh->nh_priv->nh_refcnt);
 
 	KASSERT(old > 0, ("%s: nhop object %p has 0 refs", __func__, nh));
 }
 
 int
 nhop_try_ref_object(struct nhop_object *nh)
 {
 
 	return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt));
 }
 
 /*
  * Drops a reference on @nh.
  *
  * When the last reference goes away:
  *  - a nexthop that has not been finalized is returned to the zone
  *    immediately;
  *  - a finalized nexthop is unlinked from the hash table (unless that
  *    has already been taken care of, see below) and its destruction is
  *    scheduled via NET_EPOCH_CALL().
  */
 void
 nhop_free(struct nhop_object *nh)
 {
 	struct nh_control *ctl;
 	struct nhop_priv *nh_priv = nh->nh_priv;
 	struct epoch_tracker et;
 
 	/* Not the last reference: nothing else to do */
 	if (!refcount_release(&nh_priv->nh_refcnt))
 		return;
 
 	/* Allows nhop_free() to be used during nhop initialization */
 	if (__predict_false(nh_priv->nh_finalized == 0)) {
 		uma_zfree(nhops_zone, nh);
 		return;
 	}
 
 	IF_DEBUG_LEVEL(LOG_DEBUG) {
 		char nhbuf[NHOP_PRINT_BUFSIZE] __unused;
 		FIB_NH_LOG(LOG_DEBUG, nh, "deleting %s",
 		    nhop_print_buf(nh, nhbuf, sizeof(nhbuf)));
 	}
 
 	/*
 	 * There are only 2 places where nh_linked can be decreased:
 	 *  rib destroy (nhops_destroy_rib) and this function.
 	 * nh_linked can never be increased.
 	 *
 	 * Hence, use initial value of 2 to make use of
 	 *  refcount_release_if_not_last().
 	 *
 	 * There can be two scenarios when calling this function:
 	 *
 	 * 1) nh_linked value is 2. This means that either
 	 *  nhops_destroy_rib() has not been called OR it is running,
 	 *  but we are guaranteed that nh_control won't be freed in
 	 *  this epoch. Hence, nexthop can be safely unlinked.
 	 *
 	 * 2) nh_linked value is 1. In that case, nhops_destroy_rib()
 	 *  has been called and nhop unlink can be skipped.
 	 */
 
 	NET_EPOCH_ENTER(et);
 	if (refcount_release_if_not_last(&nh_priv->nh_linked)) {
 		ctl = nh_priv->nh_control;
 		if (unlink_nhop(ctl, nh_priv) == NULL) {
 			/* Do not try to reclaim */
 			char nhbuf[NHOP_PRINT_BUFSIZE];
 			FIB_NH_LOG(LOG_WARNING, nh, "failed to unlink %s",
 			    nhop_print_buf(nh, nhbuf, sizeof(nhbuf)));
 			NET_EPOCH_EXIT(et);
 			return;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* Defer the actual destruction to the epoch callback */
 	NET_EPOCH_CALL(destroy_nhop_epoch, &nh_priv->nh_epoch_ctx);
 }
 
 /*
  * References @nh, which may be either a plain nexthop or, with
  * ROUTE_MPATH, a nexthop group.
  */
 void
 nhop_ref_any(struct nhop_object *nh)
 {
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		nhgrp_ref_object((struct nhgrp_object *)nh);
 		return;
 	}
 #endif
 	nhop_ref_object(nh);
 }
 
 /*
  * Drops a reference on @nh, which may be either a plain nexthop or,
  * with ROUTE_MPATH, a nexthop group.
  */
 void
 nhop_free_any(struct nhop_object *nh)
 {
 
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		nhgrp_free((struct nhgrp_object *)nh);
 		return;
 	}
 #endif
 	nhop_free(nh);
 }
 
 /* Nhop-related methods */
 
 /*
  * Allocates an empty (zeroed) unlinked nhop object for the given
  * @fibnum and upper-layer @family. The object starts with a single
  * reference.
  * Returns the object pointer or NULL on failure.
  */
 struct nhop_object *
 nhop_alloc(uint32_t fibnum, int family)
 {
 	struct nhop_object *nh;
 	struct nhop_priv *priv;
 
 	nh = uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO);
 	if (__predict_false(nh == NULL))
 		return (NULL);
 
 	/* The private part lives in the same item, after the aligned object */
 	priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE);
 	priv->nh = nh;
 	nh->nh_priv = priv;
 
 	priv->nh_fibnum = fibnum;
 	priv->nh_upper_family = family;
 
 	/* Setup refcount early to allow nhop_free() to work */
 	refcount_init(&priv->nh_refcnt, 1);
 
 	return (nh);
 }
 
 void
 nhop_copy(struct nhop_object *nh, const struct nhop_object *nh_orig)
 {
 	struct nhop_priv *nh_priv = nh->nh_priv;
 
 	nh->nh_flags = nh_orig->nh_flags;
 	nh->nh_mtu = nh_orig->nh_mtu;
 	memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len);
 	nh->nh_ifp = nh_orig->nh_ifp;
 	nh->nh_ifa = nh_orig->nh_ifa;
 	nh->nh_aifp = nh_orig->nh_aifp;
 
 	nh_priv->nh_upper_family = nh_orig->nh_priv->nh_upper_family;
 	nh_priv->nh_neigh_family = nh_orig->nh_priv->nh_neigh_family;
 	nh_priv->nh_type = nh_orig->nh_priv->nh_type;
 	nh_priv->rt_flags = nh_orig->nh_priv->rt_flags;
 	nh_priv->nh_fibnum = nh_orig->nh_priv->nh_fibnum;
 	nh_priv->nh_origin = nh_orig->nh_priv->nh_origin;
 }
 
 void
 nhop_set_direct_gw(struct nhop_object *nh, struct ifnet *ifp)
 {
 	nh->nh_flags &= ~NHF_GATEWAY;
 	nh->nh_priv->rt_flags &= ~RTF_GATEWAY;
 	nh->nh_priv->nh_neigh_family = nh->nh_priv->nh_upper_family;
 
 	fill_sdl_from_ifp(&nh->gwl_sa, ifp);
 	memset(&nh->gw_buf[nh->gw_sa.sa_len], 0, sizeof(nh->gw_buf) - nh->gw_sa.sa_len);
 }
 
 /*
  * Checks whether a nexthop serving @upper_family traffic may use a
  * gateway/neighbor of @neigh_family.
  *
  * Allowed combinations: identical families, an AF_UNSPEC or AF_LINK
  * neighbor family, and (when both INET and INET6 are compiled in and
  * rib_can_4o6_nhop() agrees) AF_INET over AF_INET6.
  */
 bool
 nhop_check_gateway(int upper_family, int neigh_family)
 {
 	if (upper_family == neigh_family)
 		return (true);
 
 	if (neigh_family == AF_UNSPEC || neigh_family == AF_LINK)
 		return (true);
 
 #if defined(INET) && defined(INET6)
 	if (upper_family == AF_INET && neigh_family == AF_INET6 &&
 	    rib_can_4o6_nhop())
 		return (true);
 #endif
 
 	return (false);
 }
 
 /*
  * Sets the gateway for the nexthop.
  * It can be a "normal" gateway with @is_gw set, or a special form of
  * adding an interface route, referring to it by specifying the local
  * interface address. In that case @is_gw is false.
  *
  * Returns false if @gw does not fit into the gateway buffer or its
  * family cannot be combined with the nexthop's upper family; true
  * otherwise.
  */
 bool
 nhop_set_gw(struct nhop_object *nh, const struct sockaddr *gw, bool is_gw)
 {
 	struct nhop_priv *priv = nh->nh_priv;
 
 	if (gw->sa_len > sizeof(nh->gw_buf)) {
 		FIB_NH_LOG(LOG_DEBUG, nh, "nhop SA size too big: AF %d len %u",
 		    gw->sa_family, gw->sa_len);
 		return (false);
 	}
 
 	if (!nhop_check_gateway(priv->nh_upper_family, gw->sa_family)) {
 		FIB_NH_LOG(LOG_DEBUG, nh,
 		    "error: invalid dst/gateway family combination (%d, %d)",
 		    nh->nh_priv->nh_upper_family, gw->sa_family);
 		return (false);
 	}
 
 	/* Store the address and zero the rest of the buffer */
 	memcpy(&nh->gw_sa, gw, gw->sa_len);
 	memset(&nh->gw_buf[gw->sa_len], 0, sizeof(nh->gw_buf) - gw->sa_len);
 
 	if (!is_gw) {
 		/* Interface route: the neighbor family follows the upper one */
 		nh->nh_flags &= ~NHF_GATEWAY;
 		priv->rt_flags &= ~RTF_GATEWAY;
 		priv->nh_neigh_family = priv->nh_upper_family;
 		return (true);
 	}
 
 	nh->nh_flags |= NHF_GATEWAY;
 	priv->rt_flags |= RTF_GATEWAY;
 	priv->nh_neigh_family = gw->sa_family;
 
 	return (true);
 }
 
 bool
 nhop_set_upper_family(struct nhop_object *nh, int family)
 {
 	if (!nhop_check_gateway(nh->nh_priv->nh_upper_family, family)) {
 		FIB_NH_LOG(LOG_DEBUG, nh,
 		    "error: invalid upper/neigh family combination (%d, %d)",
 		    nh->nh_priv->nh_upper_family, family);
 		return (false);
 	}
 
 	nh->nh_priv->nh_upper_family = family;
 	return (true);
 }
 
 /*
  * Sets or clears the broadcast property of @nh in both the nexthop
  * flags (NHF_BROADCAST) and the stored route flags (RTF_BROADCAST).
  */
 void
 nhop_set_broadcast(struct nhop_object *nh, bool is_broadcast)
 {
 	if (!is_broadcast) {
 		nh->nh_flags &= ~NHF_BROADCAST;
 		nh->nh_priv->rt_flags &= ~RTF_BROADCAST;
 		return;
 	}
 
 	nh->nh_flags |= NHF_BROADCAST;
 	nh->nh_priv->rt_flags |= RTF_BROADCAST;
 }
 
 void
 nhop_set_blackhole(struct nhop_object *nh, int blackhole_rt_flag)
 {
 	nh->nh_flags &= ~(NHF_BLACKHOLE | NHF_REJECT);
 	nh->nh_priv->rt_flags &= ~(RTF_BLACKHOLE | RTF_REJECT);
 	switch (blackhole_rt_flag) {
 	case RTF_BLACKHOLE:
 		nh->nh_flags |= NHF_BLACKHOLE;
 		nh->nh_priv->rt_flags |= RTF_BLACKHOLE;
 		break;
 	case RTF_REJECT:
 		nh->nh_flags |= NHF_REJECT;
 		nh->nh_priv->rt_flags |= RTF_REJECT;
 		break;
 	}
 }
 
 /*
  * Sets or clears the redirect property of @nh in both the stored
  * route flags (RTF_DYNAMIC) and the nexthop flags (NHF_REDIRECT).
  */
 void
 nhop_set_redirect(struct nhop_object *nh, bool is_redirect)
 {
 	if (!is_redirect) {
 		nh->nh_priv->rt_flags &= ~RTF_DYNAMIC;
 		nh->nh_flags &= ~NHF_REDIRECT;
 		return;
 	}
 
 	nh->nh_priv->rt_flags |= RTF_DYNAMIC;
 	nh->nh_flags |= NHF_REDIRECT;
 }
 
 /*
  * Sets or clears RTF_PINNED in the stored route flags of @nh.
  */
 void
 nhop_set_pinned(struct nhop_object *nh, bool is_pinned)
 {
 	struct nhop_priv *priv = nh->nh_priv;
 
 	if (!is_pinned) {
 		priv->rt_flags &= ~RTF_PINNED;
 		return;
 	}
 	priv->rt_flags |= RTF_PINNED;
 }
 
 uint32_t
 nhop_get_idx(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_idx);
 }
 
 uint32_t
 nhop_get_uidx(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_uidx);
 }
 
 /*
  * Stores @uidx in the private part of @nh.
  */
 void
 nhop_set_uidx(struct nhop_object *nh, uint32_t uidx)
 {
 	struct nhop_priv *priv = nh->nh_priv;
 
 	priv->nh_uidx = uidx;
 }
 
 enum nhop_type
 nhop_get_type(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_type);
 }
 
 void
 nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type)
 {
 
 	nh->nh_priv->nh_type = nh_type;
 }
 
 int
 nhop_get_rtflags(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->rt_flags);
 }
 
 /*
  * Sets generic rtflags that are not covered by other functions.
  */
 void
 nhop_set_rtflags(struct nhop_object *nh, int rt_flags)
 {
 	nh->nh_priv->rt_flags &= ~RT_SET_RTFLAGS_MASK;
 	nh->nh_priv->rt_flags |= (rt_flags & RT_SET_RTFLAGS_MASK);
 }
 
 /*
  * Sets flags that are specific to the prefix (NHF_HOST or NHF_DEFAULT).
  *
  * NHF_HOST and NHF_DEFAULT are mutually exclusive: at most the one named
  * by @nh_flag stays set; any other @nh_flag value clears both.
  * RTF_HOST in the private rt_flags is set only for NHF_HOST.
  */
 void
 nhop_set_pxtype_flag(struct nhop_object *nh, int nh_flag)
 {
 	/* Start from a clean state, then set the requested flag, if any. */
 	nh->nh_flags &= ~(NHF_HOST | NHF_DEFAULT);
 
 	if (nh_flag == NHF_HOST) {
 		nh->nh_flags |= NHF_HOST;
 		nh->nh_priv->rt_flags |= RTF_HOST;
 		return;
 	}
 
 	if (nh_flag == NHF_DEFAULT)
 		nh->nh_flags |= NHF_DEFAULT;
 	nh->nh_priv->rt_flags &= ~RTF_HOST;
 }
 
 /*
  * Sets nhop MTU. When the value comes from userland (@from_user), a
  * non-zero @mtu sets RTF_FIXEDMTU and a zero @mtu clears it; otherwise
  * the flag is left as is.
  */
 void
 nhop_set_mtu(struct nhop_object *nh, uint32_t mtu, bool from_user)
 {
 	if (from_user && mtu != 0)
 		nh->nh_priv->rt_flags |= RTF_FIXEDMTU;
 	else if (from_user)
 		nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU;
 
 	nh->nh_mtu = mtu;
 }
 
 /*
  * Stores @ifa in the nh_ifa field of @nh.
  * No reference counting is done here.
  */
 void
 nhop_set_src(struct nhop_object *nh, struct ifaddr *ifa)
 {
 
 	nh->nh_ifa = ifa;
 }
 
 /*
  * Stores @ifp in the nh_ifp field of @nh.
  * No reference counting is done here.
  */
 void
 nhop_set_transmit_ifp(struct nhop_object *nh, struct ifnet *ifp)
 {
 
 	nh->nh_ifp = ifp;
 }
 
 
 struct vnet *
 nhop_get_vnet(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_vnet);
 }
 
 /*
  * Function wrapper around nhop_select(): returns whatever nhop_select()
  * yields for @nh and @flowid.
  */
 struct nhop_object *
 nhop_select_func(struct nhop_object *nh, uint32_t flowid)
 {
 	struct nhop_object *selected;
 
 	selected = nhop_select(nh, flowid);
 
 	return (selected);
 }
 
 /*
  * Returns address family of the traffic uses the nexthop.
  */
 int
 nhop_get_upper_family(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_upper_family);
 }
 
 /*
  * Returns address family of the LLE or gateway that is used
  * to forward the traffic to.
  */
 int
 nhop_get_neigh_family(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_neigh_family);
 }
 
 uint32_t
 nhop_get_fibnum(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_fibnum);
 }
 
 /*
  * Stores @fibnum as the nh_fibnum value in the private part of @nh.
  */
 void
 nhop_set_fibnum(struct nhop_object *nh, uint32_t fibnum)
 {
 	struct nhop_priv *priv = nh->nh_priv;
 
 	priv->nh_fibnum = fibnum;
 }
 
 uint32_t
 nhop_get_expire(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_expire);
 }
 
 /*
  * Stores @expire as the nh_expire value of @nh.
  * Asserts (MPASS) that @nh is not linked yet.
  */
 void
 nhop_set_expire(struct nhop_object *nh, uint32_t expire)
 {
 	struct nhop_priv *priv;
 
 	MPASS(!NH_IS_LINKED(nh));
 
 	priv = nh->nh_priv;
 	priv->nh_expire = expire;
 }
 
 /*
  * Returns the rib head obtained from rt_tables_get_rnh() for the fib
  * number and the neighbour (gateway) family of @nh.
  */
 struct rib_head *
 nhop_get_rh(const struct nhop_object *nh)
 {
 
 	return (rt_tables_get_rnh(nhop_get_fibnum(nh),
 	    nhop_get_neigh_family(nh)));
 }
 
 uint8_t
 nhop_get_origin(const struct nhop_object *nh)
 {
 	return (nh->nh_priv->nh_origin);
 }
 
 /*
  * Stores @origin as the nh_origin value in the private part of @nh.
  */
 void
 nhop_set_origin(struct nhop_object *nh, uint8_t origin)
 {
 	struct nhop_priv *priv = nh->nh_priv;
 
 	priv->nh_origin = origin;
 }
 
 /*
  * Walks every nexthop of @rh under the nexthop write lock and stores @mtu
  * into each nexthop whose nh_ifp is @ifp. Nexthops with RTF_FIXEDMTU set
  * are only updated when their current MTU is larger than @mtu.
  */
 void
 nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct nhop_priv *priv;
 
 	NHOPS_WLOCK(ctl);
 	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, priv) {
 		struct nhop_object *cur = priv->nh;
 		bool fixed_mtu = (priv->rt_flags & RTF_FIXEDMTU) != 0;
 
 		/* Update MTU directly */
 		if (cur->nh_ifp == ifp && (!fixed_mtu || cur->nh_mtu > mtu))
 			cur->nh_mtu = mtu;
 	} CHT_SLIST_FOREACH_END;
 	NHOPS_WUNLOCK(ctl);
 }
 
 /*
  * Prints nexthop @nh data in the provided @buf of size @bufsize and
  * returns @buf.
  * Format: nh#<idx>/<upper family>/<ifname>/<gateway>, where the last
  * component is the gateway address for AF_INET/AF_INET6 gateways,
  * "resolve" for AF_LINK and "????" for anything else.
  * Example: nh#33/inet/em0/192.168.0.1
  */
 char *
 nhop_print_buf(const struct nhop_object *nh, char *buf, size_t bufsize)
 {
 #if defined(INET) || defined(INET6)
 	char abuf[INET6_ADDRSTRLEN];
 #endif
 	const struct nhop_priv *priv = nh->nh_priv;
 	const char *family_str = rib_print_family(priv->nh_upper_family);
 	const char *ifname = if_name(nh->nh_ifp);
 
 	switch (nh->gw_sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		inet_ntop(AF_INET, &nh->gw4_sa.sin_addr, abuf, sizeof(abuf));
 		snprintf(buf, bufsize, "nh#%d/%s/%s/%s", priv->nh_idx,
 		    family_str, ifname, abuf);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		inet_ntop(AF_INET6, &nh->gw6_sa.sin6_addr, abuf, sizeof(abuf));
 		snprintf(buf, bufsize, "nh#%d/%s/%s/%s", priv->nh_idx,
 		    family_str, ifname, abuf);
 		break;
 #endif
 	case AF_LINK:
 		snprintf(buf, bufsize, "nh#%d/%s/%s/resolve", priv->nh_idx,
 		    family_str, ifname);
 		break;
 	default:
 		snprintf(buf, bufsize, "nh#%d/%s/%s/????", priv->nh_idx,
 		    family_str, ifname);
 		break;
 	}
 
 	return (buf);
 }
 
 /*
  * Prints either a nexthop or, when ROUTE_MPATH is compiled in and @nh is
  * actually a nexthop group, the group description into @buf.
  * Returns the value returned by the selected print function.
  */
 char *
 nhop_print_buf_any(const struct nhop_object *nh, char *buf, size_t bufsize)
 {
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct nhgrp_object *nhg;
 
 		nhg = (const struct nhgrp_object *)nh;
 		return (nhgrp_print_buf(nhg, buf, bufsize));
 	}
 #endif
 	return (nhop_print_buf(nh, buf, bufsize));
 }
 
 /*
  * Dumps a single entry to sysctl buffer.
  *
  * Layout:
  *  rt_msghdr - generic RTM header to allow users to skip non-understood messages
  *  nhop_external - nexthop description structure (with length)
  *  nhop_addrs - structure encapsulating GW/SRC sockaddrs
  *
  * The fixed-size part above is followed by the gateway sockaddr and then
  * the source sockaddr; their offsets are recorded in nhop_addrs.
  *
  * Returns 0 on success or the first non-zero SYSCTL_OUT() error.
  */
 static int
 dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
 {
 	struct {
 		struct rt_msghdr	rtm;
 		struct nhop_external	nhe;
 		struct nhop_addrs	na;
 	} arpc;
 	struct nhop_external *pnhe;
 	struct sockaddr *gw_sa, *src_sa;
 	struct sockaddr_storage ss;
 	size_t addrs_len;
 	int error;
 
 	/* Zero the whole record so that no stack garbage is copied out */
 	memset(&arpc, 0, sizeof(arpc));
 
 	/* Length of the fixed part only; sockaddr bytes are added below */
 	arpc.rtm.rtm_msglen = sizeof(arpc);
 	arpc.rtm.rtm_version = RTM_VERSION;
 	arpc.rtm.rtm_type = RTM_GET;
 	//arpc.rtm.rtm_flags = RTF_UP;
 	arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
 
 	/* nhop_external */
 	pnhe = &arpc.nhe;
 	pnhe->nh_len = sizeof(struct nhop_external);
 	pnhe->nh_idx = nh->nh_priv->nh_idx;
 	pnhe->nh_fib = rh->rib_fibnum;
 	pnhe->ifindex = nh->nh_ifp->if_index;
 	pnhe->aifindex = nh->nh_aifp->if_index;
 	pnhe->nh_family = nh->nh_priv->nh_upper_family;
 	pnhe->nh_type = nh->nh_priv->nh_type;
 	pnhe->nh_mtu = nh->nh_mtu;
 	pnhe->nh_flags = nh->nh_flags;
 
 	memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
 	pnhe->prepend_len = nh->nh_prepend_len;
 	pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
 	pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
 
 	/* sockaddr container */
 	/* Offsets are counted from the start of struct nhop_addrs */
 	addrs_len = sizeof(struct nhop_addrs);
 	arpc.na.gw_sa_off = addrs_len;
 	gw_sa = (struct sockaddr *)&nh->gw4_sa;
 	addrs_len += gw_sa->sa_len;
 
 	src_sa = nh->nh_ifa->ifa_addr;
 	if (src_sa->sa_family == AF_LINK) {
 		/* Shorten structure */
 		memset(&ss, 0, sizeof(struct sockaddr_storage));
 		fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss,
 		    nh->nh_ifa->ifa_ifp);
 		src_sa = (struct sockaddr *)&ss;
 	}
 	arpc.na.src_sa_off = addrs_len;
 	addrs_len += src_sa->sa_len;
 
 	/* Write total container length */
 	arpc.na.na_len = addrs_len;
 
 	/* Account for the two trailing sockaddrs in the message length */
 	arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs);
 
 	/* Emit the fixed part, then gateway, then source; stop on first error */
 	error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
 	if (error == 0)
 		error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
 	if (error == 0)
 		error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
 
 	return (error);
 }
 
 /*
  * Returns the items_count of the nexthop hash of @rh, sampled under the
  * nexthop read lock.
  */
 uint32_t
 nhops_get_count(struct rib_head *rh)
 {
 	struct nh_control *ctl = rh->nh_control;
 	uint32_t nitems;
 
 	NHOPS_RLOCK(ctl);
 	nitems = ctl->nh_head.items_count;
 	NHOPS_RUNLOCK(ctl);
 
 	return (nitems);
 }
 
 /*
  * Dumps every nexthop of @rh into sysctl request @w using
  * dump_nhop_entry(), holding the nexthop read lock for the whole walk.
  *
  * Returns 0 on success or the first error reported by dump_nhop_entry().
  */
 int
 nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct nhop_priv *priv;
 	int error = 0;
 
 	NHOPS_RLOCK(ctl);
 	FIB_RH_LOG(LOG_DEBUG, rh, "dump %u items", ctl->nh_head.items_count);
 	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, priv) {
 		error = dump_nhop_entry(rh, priv->nh, w);
 		/* Leave the whole walk (not just one loop level) on error */
 		if (error != 0)
 			goto done;
 	} CHT_SLIST_FOREACH_END;
 done:
 	NHOPS_RUNLOCK(ctl);
 
 	return (error);
 }
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
index eaabe901b3cb..fd374b468b7c 100644
--- a/sys/net/route/route_ctl.c
+++ b/sys/net/route/route_ctl.c
@@ -1,1552 +1,1553 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <netinet/in.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_var.h>
 
 #define	DEBUG_MOD_NAME	route_ctl
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
 #include <net/route/route_debug.h>
 _DECLARE_DEBUG(LOG_INFO);
 
 /*
  * This file contains the control plane functions for the routing tables.
  *
  * All functions assume they are called within the net epoch.
  */
 
 /*
  * Stack storage used in this file for a prefix mask. Holds either an
  * IPv4 or an IPv6 sockaddr; _buf gives the union a size of at least
  * 32 bytes.
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 	char			_buf[32];
 };
 
 /* Forward declarations of the file-local helpers defined below. */
 
 /* Route addition / change driven by struct rt_addrinfo */
 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc);
 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
     struct rib_cmd_info *rc);
 
 /* Route addition honouring RTM_F_* operation flags */
 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
 #ifdef ROUTE_MPATH
 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
     int op_flags, struct rib_cmd_info *rc);
 #endif
 
 /* Low-level add/delete primitives */
 static int add_route(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
     struct rib_cmd_info *rc);
 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
 
 /* Priority helpers (see NH_PRIORITY_* below) */
 static int get_prio_from_info(const struct rt_addrinfo *info);
 static int nhop_get_prio(const struct nhop_object *nh);
 
 #ifdef ROUTE_MPATH
 static bool rib_can_multipath(struct rib_head *rh);
 #endif
 
 /* Per-vnet multipath routing configuration */
 SYSCTL_DECL(_net_route);
 #define	V_rib_route_multipath	VNET(rib_route_multipath)
 /*
  * The multipath knob is writable only when the kernel is built with
  * ROUTE_MPATH; otherwise it is exported read-only.
  */
 #ifdef ROUTE_MPATH
 #define _MP_FLAGS	CTLFLAG_RW
 #else
 #define _MP_FLAGS	CTLFLAG_RD
 #endif
 VNET_DEFINE(u_int, rib_route_multipath) = 1;
 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
 #undef _MP_FLAGS
 
 #ifdef ROUTE_MPATH
 /*
  * Read-only per-vnet flag, initially 0. add_route_flags_mpath() sets it
  * to 1 once a nexthop group gets installed.
  */
 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
     &VNET_NAME(fib_hash_outbound), 0,
     "Compute flowid for locally-originated packets");
 
 /* Default entropy to add to the hash calculation for the outbound connections */
 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 #endif
 
 /* Per-vnet knob for IPv4 routes via IPv6 nexthops; needs both INET and INET6 */
 #if defined(INET) && defined(INET6)
 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
 VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
 #endif
 
 /* Debug bits */
 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 /*
  * Returns the rib head for @fibnum and the address family of the
  * RTAX_DST sockaddr in @info, as reported by rt_tables_get_rnh().
  * @info->rti_info[RTAX_DST] is dereferenced unconditionally.
  */
 static struct rib_head *
 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
 {
 	const struct sockaddr *dst;
 
 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
 
 	dst = info->rti_info[RTAX_DST];
 
 	return (rt_tables_get_rnh(fibnum, dst->sa_family));
 }
 
 #if defined(INET) && defined(INET6)
 /*
  * Returns true when the per-vnet net.route.ipv6_nexthop knob is non-zero.
  */
 bool
 rib_can_4o6_nhop(void)
 {
 
 	return (V_rib_route_ipv6_nexthop != 0);
 }
 #endif
 
 #ifdef ROUTE_MPATH
 /*
  * Returns true when the multipath knob is non-zero in the vnet that
  * @rh belongs to. The current vnet is switched for the duration of
  * the read and restored afterwards.
  */
 static bool
 rib_can_multipath(struct rib_head *rh)
 {
 	bool enabled;
 
 	CURVNET_SET(rh->rib_vnet);
 	enabled = (V_rib_route_multipath != 0);
 	CURVNET_RESTORE();
 
 	return (enabled);
 }
 
 /*
  * Check is nhop is multipath-eligible.
  * Avoid nhops without gateways and redirects.
  *
  * Returns 1 for multipath-eligible nexthop,
  * 0 otherwise.
  */
 bool
 nhop_can_multipath(const struct nhop_object *nh)
 {
 
 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
 		return (1);
 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
 		return (0);
 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
 		return (0);
 
 	return (1);
 }
 #endif
 
 /*
  * Returns the route weight to use for @info.
  *
  * The weight is taken from @info->rti_rmx when RTV_WEIGHT is set in
  * rti_mflags; a missing or zero weight yields @default_weight and
  * values above RT_MAX_WEIGHT are clamped to RT_MAX_WEIGHT.
  */
 static int
 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
 {
 	uint32_t weight = default_weight;
 
 	if ((info->rti_mflags & RTV_WEIGHT) != 0)
 		weight = info->rti_rmx->rmx_weight;
 
 	if (weight == 0)
 		return (default_weight);
 
 	/* Keep upper 1 byte for adm distance purposes */
 	return (weight > RT_MAX_WEIGHT ? RT_MAX_WEIGHT : weight);
 }
 
 /*
  * File-local concept for distingushing between the normal and
  * RTF_PINNED routes tha can override the "normal" one.
  */
 #define	NH_PRIORITY_HIGH	2
 #define	NH_PRIORITY_NORMAL	1
 static int
 get_prio_from_info(const struct rt_addrinfo *info)
 {
 	if (info->rti_flags & RTF_PINNED)
 		return (NH_PRIORITY_HIGH);
 	return (NH_PRIORITY_NORMAL);
 }
 
 /*
  * Returns NH_PRIORITY_HIGH when NH_IS_PINNED() holds for @nh,
  * NH_PRIORITY_NORMAL otherwise.
  */
 static int
 nhop_get_prio(const struct nhop_object *nh)
 {
 
 	return (NH_IS_PINNED(nh) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL);
 }
 
 /*
  * Check if specified @gw matches gw data in the nexthop @nh.
  *
  * Families must be equal. AF_INET and AF_INET6 compare addresses,
  * AF_LINK compares interface indexes, any other family compares the
  * first nh->gw_sa.sa_len bytes of both sockaddrs.
  *
  * Returns true if matches, false otherwise.
  */
 bool
 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
 {
 	const struct sockaddr_in *gw4;
 	const struct sockaddr_in6 *gw6;
 	const struct sockaddr_dl *sdl;
 
 	if (nh->gw_sa.sa_family != gw->sa_family)
 		return (false);
 
 	switch (gw->sa_family) {
 	case AF_INET:
 		gw4 = (const struct sockaddr_in *)gw;
 		return (nh->gw4_sa.sin_addr.s_addr == gw4->sin_addr.s_addr);
 	case AF_INET6:
 		gw6 = (const struct sockaddr_in6 *)gw;
 		/*
 		 * Currently (2020-09) IPv6 gws in kernel have their
 		 * scope embedded. Once this becomes false, this code
 		 * has to be revisited.
 		 */
 		return (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
 		    &gw6->sin6_addr));
 	case AF_LINK:
 		sdl = (const struct sockaddr_dl *)gw;
 		return (nh->gwl_sa.sdl_index == sdl->sdl_index);
 	default:
 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
 	}
 
 	/* NOTREACHED */
 	return (false);
 }
 
 /*
  * Matches all nexthops with the given gateway @gw_sa (a struct sockaddr).
  * Can be used as rib_filter_f callback; @rt is not looked at.
  */
 int
 rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
 {
 
 	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
 }
 
 /* Callback state for match_gw_one() */
 struct gw_filter_data {
 	const struct sockaddr *gw;	/* gateway to compare nexthops with */
 	int count;			/* number of matches seen so far */
 };
 
 /*
  * Matches the first occurrence of the gateway provided in @_data
  * (a struct gw_filter_data). Every gateway match bumps the counter,
  * but only the first one is reported.
  *
  * Returns 1 for the first matching nexthop, 0 otherwise.
  */
 static int
 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
 {
 	struct gw_filter_data *gwd = _data;
 
 	if (!match_nhop_gw(nh, gwd->gw))
 		return (0);
 
 	/* Return only first match to make rtsock happy */
 	return (gwd->count++ == 0);
 }
 
 /*
  * Checks if data in @info matches nexthop @nh.
  *
  * When @info carries a filter function, only the filter decides;
  * otherwise the RTAX_GATEWAY sockaddr, if present, is compared with
  * the nexthop gateway.
  *
  * Returns 0 on success,
  * ESRCH if not matched,
  * ENOENT if filter function returned false
  */
 int
 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
     const struct nhop_object *nh)
 {
 	const struct sockaddr *gw;
 
 	if (info->rti_filter != NULL) {
 		int matched = info->rti_filter(rt, nh, info->rti_filterdata);
 
 		return (matched == 0 ? ENOENT : 0);
 	}
 
 	gw = info->rti_info[RTAX_GATEWAY];
 	if (gw == NULL || match_nhop_gw(nh, gw))
 		return (0);
 
 	return (ESRCH);
 }
 
 /*
  * Runs exact prefix match based on @dst and @netmask.
  * Asserts that the RIB lock is held.
  * Returns matched @rtentry if found or NULL.
  * If rtentry was found, saves nexthop / weight value into @rnd,
  * otherwise @rnd is reset to a NULL nexthop and zero weight.
  */
 static struct rtentry *
 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
     const struct sockaddr *netmask, struct route_nhop_data *rnd)
 {
 	struct rtentry *found;
 
 	RIB_LOCK_ASSERT(rnh);
 
 	found = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
 
 	rnd->rnd_nhop = (found != NULL) ? found->rt_nhop : NULL;
 	rnd->rnd_weight = (found != NULL) ? found->rt_weight : 0;
 
 	return (found);
 }
 
 /*
  * Runs exact prefix match using the key and mask of @rt.
  * Returns the matched rtentry or NULL and fills @rnd the same way
  * lookup_prefix_bysa() does.
  */
 struct rtentry *
 lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
     struct route_nhop_data *rnd)
 {
 	const struct sockaddr *dst = rt_key_const(rt);
 	const struct sockaddr *netmask = rt_mask_const(rt);
 
 	return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
 }
 
 /*
  * Runs exact prefix match based on dst/netmask from @info.
  * Assumes RIB lock is held.
  * Returns matched @rtentry if found or NULL.
  * If rtentry was found, saves nexthop / weight value into @rnd.
  */
 struct rtentry *
 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
     struct route_nhop_data *rnd)
 {
 	const struct sockaddr *dst = info->rti_info[RTAX_DST];
 	const struct sockaddr *netmask = info->rti_info[RTAX_NETMASK];
 
 	return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
 }
 
 /*
  * Builds the netmask sockaddr for prefix length @plen of @family in the
  * storage pointed to by *@pmask and masks the address in @_dst with it.
  *
  * @family: AF_INET or AF_INET6 (each only when compiled in)
  * @plen: prefix length, or -1 when no mask applies
  * @_dst: destination sockaddr, modified in place (host bits cleared)
  * @pmask: in: pointer to mask storage; out: set to NULL when @plen is -1
  *  or equals the full address length (32 / 128), i.e. a host route
  *
  * Returns true on success, false if @plen is out of range for @family
  * or @family is not supported.
  */
 static bool
 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
     struct sockaddr **pmask)
 {
 	if (plen == -1) {
 		*pmask = NULL;
 		return (true);
 	}
 
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
 			struct sockaddr_in *dst= (struct sockaddr_in *)_dst;
 
 			memset(mask, 0, sizeof(*mask));
 			mask->sin_family = family;
 			mask->sin_len = sizeof(*mask);
 			if (plen == 32)
 				*pmask = NULL;
 			else if (plen > 32 || plen < 0)
 				return (false);
 			else {
 				uint32_t daddr, maddr;
 				/*
 				 * Shift an unsigned 1: for plen == 1 the
 				 * shift count is 31 and a signed int shift
 				 * into the sign bit is undefined behavior.
 				 */
 				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
 				mask->sin_addr.s_addr = maddr;
 				daddr = dst->sin_addr.s_addr;
 				daddr = htonl(ntohl(daddr) & ntohl(maddr));
 				dst->sin_addr.s_addr = daddr;
 			}
 			return (true);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
 			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
 
 			memset(mask, 0, sizeof(*mask));
 			mask->sin6_family = family;
 			mask->sin6_len = sizeof(*mask);
 			if (plen == 128)
 				*pmask = NULL;
 			else if (plen > 128 || plen < 0)
 				return (false);
 			else {
 				ip6_writemask(&mask->sin6_addr, plen);
 				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
 			}
 			return (true);
 		}
 		break;
 #endif
 	}
 	/* Unsupported address family */
 	return (false);
 }
 
 /*
  * Attempts to add @dst/plen prefix with nexthop/nexthop group data @rnd
  * to the routing table.
  *
  * @fibnum: rtable id to insert route to
  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
  * @plen: prefix length (or -1 if host route or not applicable for AF)
  * @rnd: nexthop (or nexthop group) and weight to install
  * @op_flags: combination of RTM_F_ flags
  * @rc: storage to report operation result
  *
  * With RTM_F_CREATE a new rtentry is allocated for the prefix. Without
  * it the prefix has to exist already: the existing rtentry is looked up
  * and ESRCH is returned if there is none.
  *
  * Returns 0 on success.
  */
 int
 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
 {
 	union sockaddr_union mask_storage;
 	struct sockaddr *netmask = &mask_storage.sa;
 	struct rtentry *rt = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	bzero(rc, sizeof(struct rib_cmd_info));
 	rc->rc_cmd = RTM_ADD;
 
 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
 		return (EINVAL);
 	}
 
 	if (op_flags & RTM_F_CREATE) {
 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
 			return (ENOMEM);
 		}
 	} else {
 		/*
 		 * add_route_flags() reads the prefix from @rt, so it must
 		 * never be handed a NULL rtentry: use the existing one.
 		 */
 		struct route_nhop_data rnd_tmp;
 		RIB_RLOCK_TRACKER;
 
 		RIB_RLOCK(rnh);
 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
 		RIB_RUNLOCK(rnh);
 
 		if (rt == NULL)
 			return (ESRCH);
 	}
 
 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
 }
 
 /*
  * Attempts to delete @dst/plen prefix matching gateway @gw from the
  *  routing table.
  *
  * @fibnum: rtable id to remove route from
  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
  * @plen: prefix length (or -1 if host route or not applicable for AF)
  * @gw: gateway to match
  * @op_flags: combination of RTM_F_ flags
  * @rc: storage to report operation result
  *
  * Implemented as rib_del_route_px() with match_gw_one() as the filter.
  *
  * Returns 0 on success.
  */
 int
 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
 {
 	struct gw_filter_data filter_state = {
 		.gw = gw,
 		.count = 0,
 	};
 
 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one,
 	    &filter_state, op_flags, rc));
 }
 
 /*
  * Attempts to delete @dst/plen prefix matching @filter_func from the
  *  routing table.
  *
  * @fibnum: rtable id to remove route from
  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
  * @plen: prefix length (or -1 if host route or not applicable for AF)
  * @filter_func: func to be called for each nexthop of the prefix for matching
  * @filter_arg: argument to pass to @filter_func
  * @op_flags: combination of RTM_F_ flags
  * @rc: storage to report operation result
  *
  * Returns 0 on success, EAFNOSUPPORT if there is no table for the
  * address family, EINVAL on bad @dst length or @plen, ESRCH if the
  * prefix is not found, or the error from rt_delete_conditional().
  */
 int
 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
     struct rib_cmd_info *rc)
 {
 	union sockaddr_union mask_storage;
 	struct sockaddr *netmask = &mask_storage.sa;
 	struct route_nhop_data rnd;
 	struct rib_head *rnh;
 	struct rtentry *rt;
 	int error, prio;
 
 	NET_EPOCH_ASSERT();
 
 	bzero(rc, sizeof(struct rib_cmd_info));
 	rc->rc_cmd = RTM_DELETE;
 
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	if (dst->sa_len > sizeof(mask_storage)) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
 		return (EINVAL);
 	}
 
 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
 		return (EINVAL);
 	}
 
 	/* RTM_F_FORCE requests the high priority for the deletion */
 	prio = NH_PRIORITY_NORMAL;
 	if (op_flags & RTM_F_FORCE)
 		prio = NH_PRIORITY_HIGH;
 
 	RIB_WLOCK(rnh);
 	rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
 	if (rt == NULL)
 		error = ESRCH;
 	else
 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
 		    filter_arg, rc);
 	RIB_WUNLOCK(rnh);
 
 	if (error != 0)
 		return (error);
 
 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 	if (rc->rc_cmd == RTM_DELETE) {
 		rt_free(rc->rc_rt);
 		return (0);
 	}
 #ifdef ROUTE_MPATH
 	/*
 	 * Deleting 1 path may result in RTM_CHANGE to
 	 * a different mpath group/nhop.
 	 * Free old mpath group.
 	 */
 	nhop_free_any(rc->rc_nh_old);
 #endif
 
 	return (0);
 }
 
 /*
  * Tries to copy route @rt from one rtable to the rtable specified by @rh_dst.
  * @rt: route to copy.
  * @rnd_src: nhop and weight. Multipath routes are not supported
  * @rh_dst: target rtable.
  * @rc: operation result storage
  *
  * A new nexthop (copy of the source one, bound to the target fib) and
  * a new rtentry are created and handed to add_route_flags() with
  * RTM_F_CREATE. On failure add_route_flags() releases both of them
  * itself, so nothing is freed here after that call.
  *
  * Return 0 on success.
  */
 int
 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
     struct rib_head *rh_dst, struct rib_cmd_info *rc)
 {
 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
 	int error;
 
 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
 
 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
 	}
 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
 	if (nh == NULL) {
 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
 		return (ENOMEM);
 	}
 	nhop_copy(nh, rnd_src->rnd_nhop);
 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
 	if (error != 0) {
 		FIB_RH_LOG(LOG_INFO, rh_dst,
 		    "unable to finalize new nexthop: error %d", error);
 		return (ENOMEM);
 	}
 
 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
 	if (rt_new == NULL) {
 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
 		nhop_free(nh);
 		return (ENOMEM);
 	}
 
 	struct route_nhop_data rnd = {
 		.rnd_nhop = nh,
 		.rnd_weight = rnd_src->rnd_weight
 	};
 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
 
 	if (error != 0) {
 		/*
 		 * add_route_flags() has already freed @rt_new and dropped
 		 * the @nh reference on every error path. Do not touch them:
 		 * log the source route, which carries the same prefix.
 		 */
 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
 			char buf[NHOP_PRINT_BUFSIZE];
 			rt_print_buf(rt, buf, sizeof(buf));
 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
 			    "Unable to add route %s: error %d", buf, error);
 		}
 	}
 	return (error);
 }
 
 /*
  * Adds route defined by @info into the kernel table specified by @fibnum and
  * sa_family in @info->rti_info[RTAX_DST].
  *
  * For RTF_HOST requests any supplied netmask is dropped from @info;
  * non-host requests without a netmask are rejected with EINVAL.
  *
  * Returns 0 on success and fills in operation metadata into @rc.
  */
 int
 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	struct rib_head *rnh;
 	bool is_host;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	rnh = get_rnh(fibnum, info);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Check consistency between RTF_HOST flag and netmask
 	 * existence.
 	 */
 	is_host = (info->rti_flags & RTF_HOST) != 0;
 	if (!is_host && info->rti_info[RTAX_NETMASK] == NULL) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
 		return (EINVAL);
 	}
 	if (is_host)
 		info->rti_info[RTAX_NETMASK] = NULL;
 
 	bzero(rc, sizeof(struct rib_cmd_info));
 	rc->rc_cmd = RTM_ADD;
 
 	error = add_route_byinfo(rnh, info, rc);
 	if (error != 0)
 		return (error);
 
 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 	return (0);
 }
 
 /*
  * Validates @info, creates the rtentry and the nexthop it describes and
  * passes them to add_route_flags().
  *
  * RTM_F_CREATE is always requested; pinned (high priority) requests add
  * RTM_F_FORCE, all others add RTM_F_APPEND.
  *
  * Returns 0 on success, EINVAL on inconsistent @info, ENOBUFS if the
  * rtentry cannot be allocated, or the error reported by
  * rt_getifa_fib(), nhop_create_from_info() or add_route_flags().
  */
 static int
 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	struct sockaddr *dst = info->rti_info[RTAX_DST];
 	struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
 	struct sockaddr *netmask = info->rti_info[RTAX_NETMASK];
 	struct route_nhop_data rnd_add;
 	struct nhop_object *nh;
 	struct rtentry *rt;
 	int error, op_flags;
 
 	if (gateway == NULL && (info->rti_flags & RTF_GATEWAY) != 0) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
 		return (EINVAL);
 	}
 	if (dst != NULL && gateway != NULL &&
 	    !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
 		FIB_RH_LOG(LOG_DEBUG, rnh,
 		    "error: invalid dst/gateway family combination (%d, %d)",
 		    dst->sa_family, gateway->sa_family);
 		return (EINVAL);
 	}
 
 	/* The destination has to fit into the rtentry key storage */
 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
 		    dst->sa_len);
 		return (EINVAL);
 	}
 
 	if (info->rti_ifa == NULL) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 		if (error != 0)
 			return (error);
 	}
 
 	rt = rt_alloc(rnh, dst, netmask);
 	if (rt == NULL)
 		return (ENOBUFS);
 
 	error = nhop_create_from_info(rnh, info, &nh);
 	if (error != 0) {
 		rt_free_immediate(rt);
 		return (error);
 	}
 
 	rnd_add.rnd_nhop = nh;
 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
 
 	op_flags = RTM_F_CREATE;
 	op_flags |= (get_prio_from_info(info) == NH_PRIORITY_HIGH) ?
 	    RTM_F_FORCE : RTM_F_APPEND;
 
 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
 }
 
 /*
  * Adds the prefix described by @rt with nexthop data @rnd_add to @rnh,
  * following the RTM_F_* bits in @op_flags:
  *  RTM_F_CREATE  - insert @rt if the prefix does not exist yet
  *  RTM_F_EXCL    - fail with EEXIST if the prefix already exists
  *  RTM_F_REPLACE - switch the existing prefix to @rnd_add
  *  RTM_F_APPEND  - (ROUTE_MPATH only) add @rnd_add as another path
  *
  * Ownership: every path through the "out" label frees @rt when
  * RTM_F_CREATE is set and drops one nexthop reference, so callers must
  * not free either of them after a failure.
  *
  * Returns 0 on success, ESRCH if the prefix does not exist and creation
  * was not requested, EEXIST if the existing route cannot be replaced or
  * appended to, or the error from add_route() / add_route_flags_mpath().
  */
 static int
 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
     int op_flags, struct rib_cmd_info *rc)
 {
 	struct route_nhop_data rnd_orig;
 	struct nhop_object *nh;
 	struct rtentry *rt_orig;
 	int error = 0;
 
 	/* Nexthop reference to drop at "out" unless reassigned below */
 	nh = rnd_add->rnd_nhop;
 
 	RIB_WLOCK(rnh);
 
 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
 
 	if (rt_orig == NULL) {
 		if (op_flags & RTM_F_CREATE)
 			error = add_route(rnh, rt, rnd_add, rc);
 		else
 			error = ESRCH; /* no entry but creation was not required */
 		RIB_WUNLOCK(rnh);
 		if (error != 0)
 			goto out;
 		return (0);
 	}
 
 	if (op_flags & RTM_F_EXCL) {
 		/* We have existing route in the RIB but not allowed to replace. */
 		RIB_WUNLOCK(rnh);
 		error = EEXIST;
 		goto out;
 	}
 
 	/* Now either append or replace */
 	if (op_flags & RTM_F_REPLACE) {
 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
 			/* Old path is "better" (e.g. has PINNED flag set) */
 			RIB_WUNLOCK(rnh);
 			error = EEXIST;
 			goto out;
 		}
 		change_route(rnh, rt_orig, rnd_add, rc);
 		RIB_WUNLOCK(rnh);
 		/*
 		 * Success (error is still 0): "out" is reused to release the
 		 * replaced nexthop and, with RTM_F_CREATE, the unused @rt.
 		 */
 		nh = rc->rc_nh_old;
 		goto out;
 	}
 
 	RIB_WUNLOCK(rnh);
 
 #ifdef ROUTE_MPATH
 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
 
 		/* Retry while the helper reports EAGAIN, up to RIB_MAX_RETRIES times */
 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
 			    op_flags, rc);
 			if (error != EAGAIN)
 				break;
 			RTSTAT_INC(rts_add_retry);
 		}
 
 		/*
 		 *  Original nhop reference is unused in any case.
 		 */
 		nhop_free_any(rnd_add->rnd_nhop);
 		if (op_flags & RTM_F_CREATE) {
 			/* @rt is released unless the result is a successful RTM_ADD */
 			if (error != 0 || rc->rc_cmd != RTM_ADD)
 				rt_free_immediate(rt);
 		}
 		return (error);
 	}
 #endif
 	/* Out of options - free state and return error */
 	error = EEXIST;
 out:
 	if (op_flags & RTM_F_CREATE)
 		rt_free_immediate(rt);
 	nhop_free_any(nh);
 
 	return (error);
 }
 
 #ifdef ROUTE_MPATH
 static int
 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
     int op_flags, struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct route_nhop_data rnd_new;
 	int error = 0;
 
 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
 	if (error != 0) {
 		if (error == EAGAIN) {
 			/*
 			 * Group creation failed, most probably because
 			 * @rnd_orig data got scheduled for deletion.
 			 * Refresh @rnd_orig data and retry.
 			 */
 			RIB_RLOCK(rnh);
 			lookup_prefix_rt(rnh, rt, rnd_orig);
 			RIB_RUNLOCK(rnh);
 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
 				/* In this iteration route doesn't exist */
 				error = ENOENT;
 			}
 		}
 		return (error);
 	}
 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
 	if (error != 0)
 		return (error);
 
 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
 		/*
 		 * First multipath route got installed. Enable local
 		 * outbound connections hashing.
 		 */
 		if (bootverbose)
 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
 		V_fib_hash_outbound = 1;
 	}
 
 	return (0);
 }
 #endif
 
 /*
  * Removes route defined by @info from the kernel table specified by @fibnum and
  * sa_family in @info->rti_info[RTAX_DST].
  *
  * Paths are selected by @info->rti_filter if set, otherwise by the
  * RTAX_GATEWAY sockaddr (first match only) if present.
  *
  * Returns 0 on success and fills in operation metadata into @rc.
  */
 int
 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
 {
 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
 	rib_filter_f_t *filter_func = NULL;
 	void *filter_arg = NULL;
 	struct sockaddr_storage mdst;
 	struct sockaddr *dst, *netmask;
 	struct route_nhop_data rnd;
 	struct rib_head *rnh;
 	struct rtentry *rt;
 	int error, prio;
 
 	NET_EPOCH_ASSERT();
 
 	rnh = get_rnh(fibnum, info);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	bzero(rc, sizeof(struct rib_cmd_info));
 	rc->rc_cmd = RTM_DELETE;
 
 	dst = info->rti_info[RTAX_DST];
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	if (netmask != NULL) {
 		/* Ensure @dst is always properly masked */
 		if (dst->sa_len > sizeof(mdst)) {
 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
 			return (EINVAL);
 		}
 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
 		dst = (struct sockaddr *)&mdst;
 	}
 
 	/* An explicit filter takes precedence over gateway matching */
 	if (info->rti_filter != NULL) {
 		filter_func = info->rti_filter;
 		filter_arg = info->rti_filterdata;
 	} else if (gwd.gw != NULL) {
 		filter_func = match_gw_one;
 		filter_arg = &gwd;
 	}
 
 	prio = get_prio_from_info(info);
 
 	RIB_WLOCK(rnh);
 	rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
 	if (rt == NULL)
 		error = ESRCH;
 	else
 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
 		    filter_arg, rc);
 	RIB_WUNLOCK(rnh);
 
 	if (error != 0)
 		return (error);
 
 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 	if (rc->rc_cmd == RTM_DELETE) {
 		rt_free(rc->rc_rt);
 		return (0);
 	}
 #ifdef ROUTE_MPATH
 	/*
 	 * Deleting 1 path may result in RTM_CHANGE to
 	 * a different mpath group/nhop.
 	 * Free old mpath group.
 	 */
 	nhop_free_any(rc->rc_nh_old);
 #endif
 
 	return (0);
 }
 
 /*
  * Conditionally unlinks rtentry paths from @rnh matching @cb.
  *
  * @rt is the prefix to operate on, @prio is the priority of the requester,
  * @cb/@cbdata is an optional filter selecting the path(s) to remove.
  * The caller is expected to hold the rib write lock (delete_route() and
  * change_route(), called below, assert it).
  *
  * Returns 0 on success with operation result stored in @rc.
  * On error, returns:
  * ESRCH - if prefix was not found or filter function failed to match
  * EADDRINUSE - if trying to delete higher priority route.
  */
 static int
 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh = rt->rt_nhop;
 
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
 		struct route_nhop_data rnd;
 		int error;
 
 		/* Without a filter there is no way to select paths to drop. */
 		if (cb == NULL)
 			return (ESRCH);
 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
 		if (error == 0) {
 			if (rnd.rnd_nhgrp == nhg) {
 				/* No match, unreference new group and return. */
 				nhop_free_any(rnd.rnd_nhop);
 				return (ESRCH);
 			}
 			/*
 			 * Switch the route to the filtered result;
 			 * change_route() turns this into a deletion when
 			 * rnd.rnd_nhop is NULL.
 			 */
 			error = change_route(rnh, rt, &rnd, rc);
 		}
 		return (error);
 	}
 #endif
 	/* Single-path route: the filter, if any, must accept it. */
 	if (cb != NULL && !cb(rt, nh, cbdata))
 		return (ESRCH);
 
 	/* Refuse when the route's nexthop has a higher priority than @prio. */
 	if (prio < nhop_get_prio(nh))
 		return (EADDRINUSE);
 
 	return (delete_route(rnh, rt, rc));
 }
 
 /*
  * Changes the existing route matching @info in the routing table selected
  * by @fibnum and the address family of @info.
  *
  * Returns 0 on success with the operation result stored in @rc,
  * EAFNOSUPPORT if there is no matching table, ESRCH if the prefix is
  * not found, or the error of the last change attempt.
  */
 int
 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct route_nhop_data rnd_orig;
 	struct rib_head *rnh;
 	struct rtentry *rt;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	rnh = get_rnh(fibnum, info);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	bzero(rc, sizeof(struct rib_cmd_info));
 	rc->rc_cmd = RTM_CHANGE;
 
 	/*
 	 * route(8) adds the RTF_GATEWAY flag if -interface is not set.
 	 * When no gateway address accompanies the flag, drop it to
 	 * enforce consistency and maintain compatibility.
 	 */
 	if ((info->rti_flags & RTF_GATEWAY) != 0 &&
 	    info->rti_info[RTAX_GATEWAY] == NULL)
 		info->rti_flags &= ~RTF_GATEWAY;
 
 	/* Snapshot the current nexthop data of the prefix under read lock. */
 	RIB_RLOCK(rnh);
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 	if (rt != NULL) {
 		rnd_orig.rnd_nhop = rt->rt_nhop;
 		rnd_orig.rnd_weight = rt->rt_weight;
 	}
 	RIB_RUNLOCK(rnh);
 
 	if (rt == NULL)
 		return (ESRCH);
 
 	/*
 	 * A route change is done in multiple steps, dropping and
 	 * reacquiring the lock in between. When several processes change
 	 * the same route, it may be modified between the steps.
 	 * Address it by retrying the operation multiple times before
 	 * failing.
 	 */
 	for (int attempt = 0; attempt < RIB_MAX_RETRIES; attempt++) {
 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
 		if (error != EAGAIN)
 			break;
 	}
 
 	return (error);
 }
 
 static int
 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_object *nh_orig, struct nhop_object **nh_new)
 {
 	int error;
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 
 		if (error != 0) {
 			info->rti_ifa = NULL;
 			return (error);
 		}
 	}
 
 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
 	info->rti_ifa = NULL;
 
 	return (error);
 }
 
 #ifdef ROUTE_MPATH
 /*
  * Changes a single path of the multipath route @rt.
  *
  * The path to change is the first nexthop of the group in @rnd_orig for
  * which check_info_match_nhop() returns 0 against @info. A replacement
  * nexthop is built with change_nhop(), a group with that member swapped
  * is requested, and the route is switched to it conditionally.
  *
  * Returns 0 on success with the result stored in @rc, ESRCH if no path
  * matches, EAGAIN if the temporary array cannot be allocated, or the
  * error of the failing step.
  */
 static int
 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
     struct rib_cmd_info *rc)
 {
 	int error = 0, found_idx = 0;
 	struct nhop_object *nh_orig = NULL, *nh_new;
 	struct route_nhop_data rnd_new = {};
 	const struct weightened_nhop *wn = NULL;
 	struct weightened_nhop *wn_new;
 	uint32_t num_nhops;
 
 	/* Locate the group member that @info refers to. */
 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
 	for (int i = 0; i < num_nhops; i++) {
 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
 			nh_orig = wn[i].nh;
 			found_idx = i;
 			break;
 		}
 	}
 
 	if (nh_orig == NULL)
 		return (ESRCH);
 
 	error = change_nhop(rnh, info, nh_orig, &nh_new);
 	if (error != 0)
 		return (error);
 
 	/* Non-sleeping allocation; failure is reported as retryable. */
 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
 	    M_TEMP, M_NOWAIT | M_ZERO);
 	if (wn_new == NULL) {
 		nhop_free(nh_new);
 		return (EAGAIN);
 	}
 
 	/* Copy the member list, replacing only the matched slot. */
 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
 	wn_new[found_idx].nh = nh_new;
 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
 
 	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
 	/*
 	 * The local nexthop reference and the temporary array are released
 	 * regardless of the outcome.
 	 * NOTE(review): presumably the group holds its own reference on
 	 * @nh_new once created - confirm against nhgrp_get_group().
 	 */
 	nhop_free(nh_new);
 	free(wn_new, M_TEMP);
 
 	if (error != 0)
 		return (error);
 
 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
 
 	return (error);
 }
 #endif
 
 static int
 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
     struct rib_cmd_info *rc)
 {
 	int error = 0;
 	struct nhop_object *nh_orig;
 	struct route_nhop_data rnd_new;
 
 	nh_orig = rnd_orig->rnd_nhop;
 	if (nh_orig == NULL)
 		return (ESRCH);
 
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh_orig))
 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
 #endif
 
 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
 	if (error != 0)
 		return (error);
 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
 
 	return (error);
 }
 
 /*
  * Inserts @rt into @rnh using the nexthop and weight from @rnd.
  * Must be called with the rib write lock held.
  *
  * Returns 0 on success and stores operation results in @rc,
  * EEXIST if the radix insertion fails.
  */
 static int
 add_route(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh_new;
 	struct radix_node *rn;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	nh_new = rnd->rnd_nhop;
 	rt->rt_nhop = nh_new;
 	rt->rt_weight = rnd->rnd_weight;
 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
 	if (rn == NULL) {
 		/* Existing route or memory allocation failure. */
 		return (EEXIST);
 	}
 
 	/* Single nexthops with an expiration time get tracked. */
 	if (!NH_IS_NHGRP(nh_new) && nhop_get_expire(nh_new))
 		tmproutes_update(rnh, rt, nh_new);
 
 	rib_bump_gen(rnh);
 	rnh->rnh_prefixes++;
 
 	/* Finalize notification */
 	rc->rc_cmd = RTM_ADD;
 	rc->rc_rt = rt;
 	rc->rc_nh_old = NULL;
 	rc->rc_nh_new = nh_new;
 	rc->rc_nh_weight = rnd->rnd_weight;
 
 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 
 	return (0);
 }
 
 /*
  * Unconditionally removes the prefix of @rt from @rnh.
  * Must be called with the rib write lock held.
  *
  * Returns 0 on success and stores operation results in @rc,
  * ESRCH if the prefix is not present in the tree.
  */
 static int
 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
 {
 	struct radix_node *rn;
 	struct rtentry *rt_del;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
 	if (rn == NULL)
 		return (ESRCH);
 
 	/* Continue with the entry that was actually unlinked. */
 	rt_del = RNTORT(rn);
 	rt_del->rte_flags &= ~RTF_UP;
 
 	rib_bump_gen(rnh);
 	rnh->rnh_prefixes--;
 
 	rc->rc_cmd = RTM_DELETE;
 	rc->rc_rt = rt_del;
 	rc->rc_nh_old = rt_del->rt_nhop;
 	rc->rc_nh_new = NULL;
 	rc->rc_nh_weight = rt_del->rt_weight;
 
 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 
 	return (0);
 }
 
 /*
  * Switches @rt nhop/weight to the ones specified in @rnd.
  * A NULL nexthop in @rnd requests deletion of the route instead.
  * Must be called with the rib write lock held.
  *
  * Returns 0 on success and stores operation results in @rc.
  */
 int
 change_route(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
 {
 	struct nhop_object *nh_prev, *nh_next;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	nh_prev = rt->rt_nhop;
 	nh_next = rnd->rnd_nhop;
 
 	if (nh_next == NULL)
 		return (delete_route(rnh, rt, rc));
 
 	/* Changing nexthop & weight to a new one */
 	rt->rt_nhop = nh_next;
 	rt->rt_weight = rnd->rnd_weight;
 	if (!NH_IS_NHGRP(nh_next) && nhop_get_expire(nh_next))
 		tmproutes_update(rnh, rt, nh_next);
 
 	rib_bump_gen(rnh);
 
 	/* Finalize notification */
 	rc->rc_cmd = RTM_CHANGE;
 	rc->rc_rt = rt;
 	rc->rc_nh_old = nh_prev;
 	rc->rc_nh_new = nh_next;
 	rc->rc_nh_weight = rnd->rnd_weight;
 
 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
 
 	return (0);
 }
 
 /*
  * Conditionally update route nhop/weight IFF data in @rnd_orig is
  *  consistent with the current route data.
  *
  * If the prefix does not exist and @rnd_orig carries no nexthop, the route
  * is added instead. If the table state differs from @rnd_orig, @rnd_orig
  * is updated with the current data and EAGAIN is returned so the caller
  * can retry.
  *
  * Nexthop in @rnd_new is consumed: it is either installed into the table
  * or freed here on error. On success the nexthop in @rnd_orig is freed.
  */
 int
 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
     struct rib_cmd_info *rc)
 {
 	struct rtentry *rt_new;
 	int error = 0;
 
 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
 		    "trying change %s -> %s", buf_old, buf_new);
 	}
 	RIB_WLOCK(rnh);
 
 	/* Re-read the prefix state now that the write lock is held. */
 	struct route_nhop_data rnd;
 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
 
 	if (rt_new == NULL) {
 		if (rnd_orig->rnd_nhop == NULL)
 			error = add_route(rnh, rt, rnd_new, rc);
 		else {
 			/*
 			 * Prefix does not exist, which was not our assumption.
 			 * Update @rnd_orig with the new data and return
 			 */
 			rnd_orig->rnd_nhop = NULL;
 			rnd_orig->rnd_weight = 0;
 			error = EAGAIN;
 		}
 	} else {
 		/* Prefix exists, try to update */
 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
 			/*
 			 * Nhop/mpath group hasn't changed. Flip
 			 * to the new precalculated one and return
 			 */
 			error = change_route(rnh, rt_new, rnd_new, rc);
 		} else {
 			/* Update and retry */
 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
 			rnd_orig->rnd_weight = rt_new->rt_weight;
 			error = EAGAIN;
 		}
 	}
 
 	RIB_WUNLOCK(rnh);
 
 	if (error == 0) {
 		/* Delayed notification is sent after dropping the lock. */
 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
 
 		/* The replaced nexthop is no longer referenced by the route. */
 		if (rnd_orig->rnd_nhop != NULL)
 			nhop_free_any(rnd_orig->rnd_nhop);
 
 	} else {
 		/* Not installed: drop the consumed nexthop from @rnd_new. */
 		if (rnd_new->rnd_nhop != NULL)
 			nhop_free_any(rnd_new->rnd_nhop);
 	}
 
 	return (error);
 }
 
 /*
  * Performs the modification of a routing table specified by @action
  * (RTM_ADD, RTM_DELETE or RTM_CHANGE).
  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
  * Needs to be run in network epoch.
  *
  * Returns 0 on success and fills in @rc with action result.
  * Returns ENOTSUP for any other @action.
  */
 int
 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
     struct rib_cmd_info *rc)
 {
 
 	switch (action) {
 	case RTM_ADD:
 		return (rib_add_route(fibnum, info, rc));
 	case RTM_DELETE:
 		return (rib_del_route(fibnum, info, rc));
 	case RTM_CHANGE:
 		return (rib_change_route(fibnum, info, rc));
 	default:
 		return (ENOTSUP);
 	}
 }
 
 /*
  * State passed to rt_checkdelroute() while walking a routing table.
  */
 struct rt_delinfo
 {
 	struct rib_head *rnh;		/* table being walked */
 	struct rtentry *head;		/* deleted entries, linked via rt_chain */
 	rib_filter_f_t *filter_f;	/* filter selecting routes/paths */
 	void *filter_arg;		/* opaque argument for @filter_f */
 	int prio;			/* priority passed to rt_delete_conditional() */
 	struct rib_cmd_info rc;		/* result of the last operation */
 };
 
 /*
  * Conditionally unlinks rtentries or paths from radix tree based
  * on the callback data passed in @arg.
  *
  * Tree-walk callback: @rn is the visited node, @arg points to a
  * struct rt_delinfo. Always returns 0, so a route that is not deleted
  * does not stop the walk.
  */
 static int
 rt_checkdelroute(struct radix_node *rn, void *arg)
 {
 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	/* Nothing matched or deletion was refused: keep walking. */
 	if (rt_delete_conditional(di->rnh, rt, di->prio,
 	    di->filter_f, di->filter_arg, &di->rc) != 0)
 		return (0);
 
 	/*
 	 * Add deleted rtentries to the list to GC them
 	 *  after dropping the lock.
 	 *
 	 * XXX: Delayed notifications not implemented
 	 *  for nexthop updates.
 	 */
 	if (di->rc.rc_cmd == RTM_DELETE) {
 		/* Add to the list and return */
 		rt->rt_chain = di->head;
 		di->head = rt;
 #ifdef ROUTE_MPATH
 	} else {
 		/*
 		 * RTM_CHANGE to a different nexthop or nexthop group.
 		 * Free old multipath group.
 		 */
 		nhop_free_any(di->rc.rc_nh_old);
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Iterates over a routing table specified by @fibnum and @family and
  *  deletes elements marked by @filter_f.
  * @fibnum: rtable id
  * @family: AF_ address family
  * @filter_f: function returning non-zero value for items to delete
  * @filter_arg: data to pass to the @filter_f function
  * @report: true if rtsock notification is needed.
  *
  * Enters the network epoch itself. Entries are unlinked under the rib
  * write lock and then notified about and freed after the lock is dropped.
  */
 void
 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
     bool report)
 {
 	struct rib_head *rnh;
 	struct rtentry *rt;
 	struct nhop_object *nh;
 	struct epoch_tracker et;
 
 	rnh = rt_tables_get_rnh(fibnum, family);
 	if (rnh == NULL)
 		return;
 
 	/* Fields not listed (head, rc) start out zeroed. */
 	struct rt_delinfo di = {
 		.rnh = rnh,
 		.filter_f = filter_f,
 		.filter_arg = filter_arg,
 		.prio = NH_PRIORITY_NORMAL,
 	};
 
 	NET_EPOCH_ENTER(et);
 
 	RIB_WLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
 	RIB_WUNLOCK(rnh);
 
 	/* We might have something to reclaim. */
 	bzero(&di.rc, sizeof(di.rc));
 	di.rc.rc_cmd = RTM_DELETE;
 	while (di.head != NULL) {
 		/* Pop the next deleted entry off the chain. */
 		rt = di.head;
 		di.head = rt->rt_chain;
 		rt->rt_chain = NULL;
 		nh = rt->rt_nhop;
 
 		di.rc.rc_rt = rt;
 		di.rc.rc_nh_old = nh;
 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
 
 		if (report) {
 #ifdef ROUTE_MPATH
 			struct nhgrp_object *nhg;
 			const struct weightened_nhop *wn;
 			uint32_t num_nhops;
 			/* For a nexthop group, report each member separately. */
 			if (NH_IS_NHGRP(nh)) {
 				nhg = (struct nhgrp_object *)nh;
 				wn = nhgrp_get_nhops(nhg, &num_nhops);
 				for (int i = 0; i < num_nhops; i++)
 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
 			} else
 #endif
 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
 		}
 		rt_free(rt);
 	}
 
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Tree-walk callback removing the visited entry from the table passed
  * in @arg. The entry is freed only if the node returned by the removal
  * is the visited entry itself. Always returns 0.
  */
 static int
 rt_delete_unconditional(struct radix_node *rn, void *arg)
 {
 	struct rib_head *rnh = (struct rib_head *)arg;
 	struct rtentry *rt = RNTORT(rn);
 	struct radix_node *rn_del;
 
 	rn_del = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
 	if (RNTORT(rn_del) == rt)
 		rt_free(rt);
 
 	return (0);
 }
 
 /*
  * Removes all routes from the routing table @rnh without executing
  * notifications. The whole walk runs under the rib write lock.
  * rtentries will be removed after the end of the current epoch.
  */
 static void
 rib_flush_routes(struct rib_head *rnh)
 {
 	RIB_WLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
 	RIB_WUNLOCK(rnh);
 }
 
 /*
  * Flushes the routes of address family @family in every fib that has a
  * table for it.
  */
 void
 rib_flush_routes_family(int family)
 {
 	uint32_t fibnum;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);
 
 		if (rnh == NULL)
 			continue;
 		rib_flush_routes(rnh);
 	}
 }
 
 /*
  * Returns a short printable name for address family @family:
  * "inet", "inet6" or "link", and "unknown" for anything else.
  */
 const char *
 rib_print_family(int family)
 {
 	if (family == AF_INET)
 		return ("inet");
 	if (family == AF_INET6)
 		return ("inet6");
 	if (family == AF_LINK)
 		return ("link");
 
 	return ("unknown");
 }
 
diff --git a/sys/net/route/route_ddb.c b/sys/net/route/route_ddb.c
index 437ede01b4a8..aa2a33ed6c90 100644
--- a/sys/net/route/route_ddb.c
+++ b/sys/net/route/route_ddb.c
@@ -1,270 +1,271 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 2019 Conrad Meyer <cem@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/ctype.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 
 /*
  * Printable names of route flags, indexed by the 0-based bit position of
  * the flag. Positions without an initializer are NULL.
  *
  * Unfortunately, RTF_ values are expressed as raw masks rather than powers of
  * 2, so we cannot use them as nice C99 initializer indices below.
  */
 static const char * const rtf_flag_strings[] = {
 	"UP",
 	"GATEWAY",
 	"HOST",
 	"REJECT",
 	"DYNAMIC",
 	"MODIFIED",
 	"DONE",
 	"UNUSED_0x80",
 	"UNUSED_0x100",
 	"XRESOLVE",
 	"LLDATA",
 	"STATIC",
 	"BLACKHOLE",
 	"UNUSED_0x2000",
 	"PROTO2",
 	"PROTO1",
 	"UNUSED_0x10000",
 	"UNUSED_0x20000",
 	"PROTO3",
 	"FIXEDMTU",
 	"PINNED",
 	"LOCAL",
 	"BROADCAST",
 	"MULTICAST",
 	/* Big gap. */
 	[28] = "STICKY",
 	[30] = "RNH_LOCKED",
 	[31] = "GWFLAG_COMPAT",
 };
 
 /*
  * Returns the printable name of the route flag at bit position @idx:
  * "INVALID_FLAG" if @idx is beyond the name table, "UNKNOWN" if the
  * table has no name for that position.
  */
 static const char * __pure
 rt_flag_name(unsigned idx)
 {
 	const char *name;
 
 	if (idx >= nitems(rtf_flag_strings))
 		return ("INVALID_FLAG");
 
 	name = rtf_flag_strings[idx];
 	return (name != NULL ? name : "UNKNOWN");
 }
 
 /*
  * Prints "@name <address> " for sockaddr @sa to the ddb console.
  * IPv4/IPv6 addresses are printed numerically, AF_LINK as "on link",
  * a NULL @sa as "NULL"; any other family (or a failed conversion) is
  * printed as "<af:N>".
  */
 static void
 rt_dumpaddr_ddb(const char *name, const struct sockaddr *sa)
 {
 	char buf[INET6_ADDRSTRLEN], *text = NULL;
 
 	if (sa == NULL) {
 		db_printf("%s <%s> ", name, "NULL");
 		return;
 	}
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		text = inet_ntop(AF_INET,
 		    &((const struct sockaddr_in *)sa)->sin_addr,
 		    buf, sizeof(buf));
 		break;
 	case AF_INET6:
 		text = inet_ntop(AF_INET6,
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr,
 		    buf, sizeof(buf));
 		break;
 	case AF_LINK:
 		text = "on link";
 		break;
 	default:
 		break;
 	}
 
 	if (text == NULL)
 		db_printf("%s <af:%d> ", name, sa->sa_family);
 	else
 		db_printf("%s <%s> ", name, text);
 }
 
 /*
  * Prints one routing table entry on a single line of the ddb console:
  * destination, gateway, netmask, interface addresses (skipped when the
  * interface is marked IFF_DYING) and the names of the set flags.
  * Usable as a tree-walk callback; always returns 0.
  */
 static int
 rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused)
 {
 	struct sockaddr_storage ss;
 	/* If RNTORT is important, put it in a header. */
 	struct rtentry *rt = (void *)rn;
 	struct nhop_object *nh = (struct nhop_object *)rt->rt_nhop;
 	int flags, idx;
 
 	rt_dumpaddr_ddb("dst", rt_key(rt));
 	rt_dumpaddr_ddb("gateway", &nh->gw_sa);
 	rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt),
 	    &ss));
 	if ((nh->nh_ifp->if_flags & IFF_DYING) == 0) {
 		rt_dumpaddr_ddb("ifp", nh->nh_ifp->if_addr->ifa_addr);
 		rt_dumpaddr_ddb("ifa", nh->nh_ifa->ifa_addr);
 	}
 
 	db_printf("flags ");
 	flags = rt->rte_flags | nhop_get_rtflags(nh);
 	if (flags == 0)
 		db_printf("none");
 
 	/* Print and clear the lowest set bit until none is left. */
 	for (idx = ffs(flags); idx > 0; idx = ffs(flags)) {
 		idx--;
 
 		db_printf("%s", rt_flag_name(idx));
 		flags &= ~(1ul << idx);
 		if (flags != 0)
 			db_printf(",");
 	}
 
 	db_printf("\n");
 	return (0);
 }
 
 /*
  * ddb "show routetable [af]" command: dumps the fib 0 routing table of
  * the given address family, or of every family from 1 to AF_MAX that
  * has a table when no address family is given.
  */
 DB_SHOW_COMMAND(routetable, db_show_routetable_cmd)
 {
 	struct rib_head *rnh;
 	const char *af_name;
 	int af, af_last, error, is_inet;
 
 	if (have_addr) {
 		af_last = addr;
 		af = af_last;
 	} else {
 		af = 1;
 		af_last = AF_MAX;
 	}
 
 	for (; af <= af_last; af++) {
 		rnh = rt_tables_get_rnh(0, af);
 		if (rnh == NULL && have_addr) {
 			db_printf("%s: AF %d not supported?\n",
 			    __func__, af);
 			break;
 		}
 		if (rnh == NULL)
 			continue;
 
 		/* Separate consecutive tables with an empty line. */
 		if (!have_addr && af > 1)
 			db_printf("\n");
 
 		is_inet = (af == AF_INET || af == AF_INET6);
 		if (af == AF_INET)
 			af_name = "INET";
 		else if (af == AF_INET6)
 			af_name = "INET6";
 		else
 			af_name = "";
 		db_printf("Route table for AF %d%s%s%s:\n", af,
 		    is_inet ? " (" : "", af_name, is_inet ? ")" : "");
 
 		error = rnh->rnh_walktree(&rnh->head, rt_dumpentry_ddb, NULL);
 		if (error != 0)
 			db_printf("%s: walktree(%d): %d\n", __func__, af,
 			    error);
 	}
 }
 
 /*
  * ddb "show route <address>" command: looks up the route to an IPv4 or
  * IPv6 destination in the default fib of vnet0 and dumps the matching
  * entry. The rest of the command line is taken as the address; a line
  * containing ':' is treated as IPv6, anything else as IPv4.
  */
 DB_SHOW_COMMAND_FLAGS(route, db_show_route_cmd, CS_OWN)
 {
 	char abuf[INET6_ADDRSTRLEN], *buf, *end;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	void *dst_addrp;
 	struct rtentry *rt;
 	union {
 		struct sockaddr_in dest_sin;
 		struct sockaddr_in6 dest_sin6;
 	} u;
 	int af;
 
 	buf = db_get_line();
 
 	/* Remove whitespaces from both ends */
 	end = buf + strlen(buf) - 1;
 	for (; (end >= buf) && (*end=='\n' || isspace(*end)); end--)
 		*end = '\0';
 	while (isspace(*buf))
 		buf++;
 
 	/*
 	 * Start from an all-zero lookup key: only family, length and the
 	 * address itself are filled in below, and the remaining sockaddr
 	 * fields must not carry stack garbage into the lookup.
 	 */
 	bzero(&u, sizeof(u));
 
 	/* Determine AF */
 	if (strchr(buf, ':') != NULL) {
 		af = AF_INET6;
 		u.dest_sin6.sin6_family = af;
 		u.dest_sin6.sin6_len = sizeof(struct sockaddr_in6);
 		dst_addrp = &u.dest_sin6.sin6_addr;
 	} else {
 		af = AF_INET;
 		u.dest_sin.sin_family = af;
 		u.dest_sin.sin_len = sizeof(struct sockaddr_in);
 		dst_addrp = &u.dest_sin.sin_addr;
 	}
 
 	if (inet_pton(af, buf, dst_addrp) != 1)
 		goto usage;
 
 	if (inet_ntop(af, dst_addrp, abuf, sizeof(abuf)) != NULL)
 		db_printf("Looking up route to destination '%s'\n", abuf);
 
 	rt = NULL;
 	CURVNET_SET(vnet0);
 
 	/* There may be no table for this address family. */
 	rh = rt_tables_get_rnh(RT_DEFAULT_FIB, af);
 	if (rh != NULL) {
 		rn = rh->rnh_matchaddr(&u, &rh->head);
 		if (rn && ((rn->rn_flags & RNF_ROOT) == 0))
 			rt = (struct rtentry *)rn;
 	}
 
 	CURVNET_RESTORE();
 
 	if (rh == NULL) {
 		db_printf("%s: AF %d not supported?\n", __func__, af);
 		return;
 	}
 
 	if (rt == NULL) {
 		db_printf("Could not get route for that server.\n");
 		return;
 	}
 
 	rt_dumpentry_ddb((void *)rt, NULL);
 
 	return;
 usage:
 	db_printf("Usage: 'show route <address>'\n"
 	    "  Currently accepts only IPv4 and IPv6 addresses\n");
 	db_skip_to_eol();
 }
diff --git a/sys/net/route/route_ifaddrs.c b/sys/net/route/route_ifaddrs.c
index a456ffa28696..4393ab635632 100644
--- a/sys/net/route/route_ifaddrs.c
+++ b/sys/net/route/route_ifaddrs.c
@@ -1,243 +1,244 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 /*
  * Control interface address fib propagation.
  * By default, interface address routes are added to the fib of the interface.
  * Once set to non-zero, adds interface address route to all fibs.
  *
  * Per-vnet variable, defaults to 0; exported as the add_addr_allfibs
  * sysctl under the net node.
  */
 VNET_DEFINE(u_int, rt_add_addr_allfibs) = 0;
 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
     &VNET_NAME(rt_add_addr_allfibs), 0, "");
 
 /*
  * Applies the routing table change described by @cmd and @info to the fib
  * @fibnum and, on success, generates a routing message for it.
  * The message carries the new nexthop for RTM_ADD and the old one for
  * any other command.
  * Note: it assumes there is only single route (interface route) for the
  * provided prefix.
  * Returns 0 on success or errno.
  */
 static int
 rib_handle_ifaddr_one(uint32_t fibnum, int cmd, struct rt_addrinfo *info)
 {
 	struct rib_cmd_info rc;
 	struct nhop_object *nh_any, *nh;
 	int error;
 
 	error = rib_action(fibnum, cmd, info, &rc);
 	if (error != 0)
 		return (error);
 
 	nh_any = (cmd == RTM_ADD) ? rc.rc_nh_new : rc.rc_nh_old;
 	nh = nhop_select(nh_any, 0);
 	rt_routemsg(cmd, rc.rc_rt, nh, fibnum);
 
 	return (0);
 }
 
 /*
  * Adds/deletes interface prefix specified by @info to the routing table.
  * If V_rt_add_addr_allfibs is set, iterates over all existing routing
  * tables, otherwise uses fib in @fibnum. Generates routing message for
  *  each table.
  *
  * For RTM_DELETE the call succeeds if the prefix was removed from at
  * least one table; otherwise EHOSTUNREACH (host routes) or ENETUNREACH
  * is returned. For other commands a failure in any table is reported.
  * Returns 0 on success or errno.
  */
 int
 rib_handle_ifaddr_info(uint32_t fibnum, int cmd, struct rt_addrinfo *info)
 {
 	int error = 0, last_error = 0;
 	bool didwork = false;
 	uint32_t fib;
 
 	if (V_rt_add_addr_allfibs != 0) {
 		for (fib = 0; fib < V_rt_numfibs; fib++) {
 			error = rib_handle_ifaddr_one(fib, cmd, info);
 			if (error != 0)
 				last_error = error;
 			else
 				didwork = true;
 		}
 	} else {
 		error = rib_handle_ifaddr_one(fibnum, cmd, info);
 		didwork = (error == 0);
 	}
 
 	if (cmd != RTM_DELETE) {
 		/* return an error if any of them failed */
 		return (last_error != 0 ? last_error : error);
 	}
 
 	if (didwork)
 		return (0);
 
 	/* we only give an error if it wasn't in any table */
 	return ((info->rti_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH);
 }
 
 /*
  * Adds, deletes or changes (per @cmd) the host route for address @ia of
  * interface address @ifa in the fib of the owning interface.
  * For commands other than RTM_DELETE the route is pointed at V_loif.
  * @otype is a human-readable operation name used only in the failure
  * log message.
  *
  * Returns 0 on success or errno. EEXIST on RTM_ADD and ENOENT/ESRCH on
  * RTM_DELETE are returned without being logged.
  */
 static int
 ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa,
     struct sockaddr *ia)
 {
 	struct rib_cmd_info rc;
 	struct epoch_tracker et;
 	int error;
 	struct rt_addrinfo info;
 	struct sockaddr_dl null_sdl;
 	struct ifnet *ifp;
 
 	ifp = ifa->ifa_ifp;
 
 	NET_EPOCH_ENTER(et);
 	bzero(&info, sizeof(info));
 	if (cmd != RTM_DELETE)
 		info.rti_ifp = V_loif;
 	if (cmd == RTM_ADD) {
 		/* explicitly specify (loopback) ifa */
 		if (info.rti_ifp != NULL)
 			info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp);
 	}
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED;
 	info.rti_info[RTAX_DST] = ia;
 	/* The gateway is a link-level sockaddr initialized from @ifp. */
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type);
 
 	error = rib_action(ifp->if_fib, cmd, &info, &rc);
 	NET_EPOCH_EXIT(et);
 
 	if (error == 0 ||
 	    (cmd == RTM_ADD && error == EEXIST) ||
 	    (cmd == RTM_DELETE && (error == ENOENT || error == ESRCH)))
 		return (error);
 
 	/* @error is a signed int, so print it with %d. */
 	log(LOG_DEBUG, "%s: %s failed for interface %s: %d\n",
 		__func__, otype, if_name(ifp), error);
 
 	return (error);
 }
 
 /*
  * Adds the loopback host route for address @ia of @ifa
  * (RTM_ADD via ifa_maintain_loopback_route()).
  * Returns 0 on success or errno.
  */
 int
 ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia));
 }
 
 /*
  * Deletes the loopback host route for address @ia of @ifa
  * (RTM_DELETE via ifa_maintain_loopback_route()).
  * Returns 0 on success or errno.
  */
 int
 ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia));
 }
 
 /*
  * Changes the existing loopback host route for address @ia of @ifa
  * (RTM_CHANGE via ifa_maintain_loopback_route()).
  * Returns 0 on success or errno.
  */
 int
 ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia));
 }
 
 /*
  * Returns true if @nh is a single nexthop (not a group) carrying
  * RTF_PINNED whose address interface belongs to the same fib as the
  * nexthop itself.
  */
 static bool
 match_kernel_route(const struct rtentry *rt, struct nhop_object *nh)
 {
 
 	if (NH_IS_NHGRP(nh))
 		return (false);
 	if ((nhop_get_rtflags(nh) & RTF_PINNED) == 0)
 		return (false);
 
 	return (nh->nh_aifp->if_fib == nhop_get_fibnum(nh));
 }
 
 /*
  * Table-walk callback: copies @rt into the rib passed in @arg when
  * match_kernel_route() accepts it. The result of the copy is not
  * checked. Always returns 0.
  */
 static int
 pick_kernel_route(struct rtentry *rt, void *arg)
 {
 	struct rib_head *rh_dst = (struct rib_head *)arg;
 	struct nhop_object *nh = rt->rt_nhop;
 
 	if (!match_kernel_route(rt, nh))
 		return (0);
 
 	struct rib_cmd_info rc = {};
 	struct route_nhop_data rnd = {
 		.rnd_nhop = nh,
 		.rnd_weight = rt->rt_weight,
 	};
 	rib_copy_route(rt, &rnd, rh_dst, &rc);
 
 	return (0);
 }
 
 /*
  * Tries to copy kernel routes matching pattern from @rh_src to @rh_dst.
  * Does nothing unless V_rt_add_addr_allfibs is set.
  *
  * Note: as this function acquires locks for both @rh_src and @rh_dst,
  *  it needs to be called under RTABLES_LOCK() to avoid deadlocking
  * with multiple ribs.
  */
 void
 rib_copy_kernel_routes(struct rib_head *rh_src, struct rib_head *rh_dst)
 {
 	struct epoch_tracker et;
 
 	if (V_rt_add_addr_allfibs != 0) {
 		NET_EPOCH_ENTER(et);
 		rib_walk_ext_internal(rh_src, false, pick_kernel_route, NULL,
 		    rh_dst);
 		NET_EPOCH_EXIT(et);
 	}
 }
 
diff --git a/sys/net/route/route_rtentry.c b/sys/net/route/route_rtentry.c
index 64900ae3ae39..0c3c8ddd7361 100644
--- a/sys/net/route/route_rtentry.c
+++ b/sys/net/route/route_rtentry.c
@@ -1,309 +1,308 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2021-2022 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <netinet/in.h>
 #include <netinet6/scope6_var.h>
-#include <netinet6/in6_var.h>
 
 #include <vm/uma.h>
 
 /* Routing table UMA zone */
 VNET_DEFINE_STATIC(uma_zone_t, rtzone);
 #define	V_rtzone	VNET(rtzone)
 
 /*
  * Creates the UMA zone named "rtentry", sized for struct rtentry with
  * pointer alignment, and stores it in the per-vnet V_rtzone.
  */
 void
 vnet_rtzone_init(void)
 {
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 #ifdef VIMAGE
 /*
  * Destroys the per-vnet rtentry UMA zone (V_rtzone).
  * Only compiled when VIMAGE is enabled.
  */
 void
 vnet_rtzone_destroy(void)
 {
 
 	uma_zdestroy(V_rtzone);
 }
 #endif
 
 /*
  * Allocates a zeroed rtentry from V_rtzone and initializes it from
  * @dst/@netmask data.  @rnh is not used by this function.
  * Returns the new rtentry on success or NULL if the allocation fails.
  * Note: rtentry mask ptr will be set to @netmask , thus its pointer is required
  *  to be stable till the end of the operation (radix rt insertion/change/removal).
  */
 struct rtentry *
 rt_alloc(struct rib_head *rnh, const struct sockaddr *dst,
     struct sockaddr *netmask)
 {
 	/* @dst must fit into the rtentry's embedded destination storage. */
 	MPASS(dst->sa_len <= sizeof(((struct rtentry *)NULL)->rt_dstb));
 
 	struct rtentry *rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
 	if (rt == NULL)
 		return (NULL);
 	/* A route without a netmask is flagged as a host route. */
 	rt->rte_flags = RTF_UP | (netmask == NULL ? RTF_HOST : 0);
 
 	/* Fill in dst, ensuring it's masked if needed. */
 	if (netmask != NULL) {
 		rt_maskedcopy(dst, &rt->rt_dst, netmask);
 	} else
 		bcopy(dst, &rt->rt_dst, dst->sa_len);
 	rt_key(rt) = &rt->rt_dst;
 	/* Set netmask to the storage from info. It will be updated upon insertion */
 	rt_mask(rt) = netmask;
 
 	return (rt);
 }
 
 /*
  * Releases the nexthop (or nexthop group) held by @rt and frees @rt
  * immediately.  With VIMAGE the current vnet is switched to the one
  * recorded in the nexthop for the duration of the teardown; for a
  * nexthop group (ROUTE_MPATH) the first member nexthop supplies the vnet.
  */
 static void
 destroy_rtentry(struct rtentry *rt)
 {
 #ifdef VIMAGE
 	struct nhop_object *nh = rt->rt_nhop;
 
 	/*
 	 * At this moment rnh, nh_control may be already freed.
 	 * nhop interface may have been migrated to a different vnet.
 	 * Use vnet stored in the nexthop to delete the entry.
 	 */
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct weightened_nhop *wn;
 		uint32_t num_nhops;
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		nh = wn[0].nh;
 	}
 #endif
 	CURVNET_SET(nhop_get_vnet(nh));
 #endif
 
 	/* Unreference nexthop */
 	nhop_free_any(rt->rt_nhop);
 
 	rt_free_immediate(rt);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Epoch callback indicating rtentry is safe to destroy.
  * Recovers the rtentry that embeds @ctx as its rt_epoch_ctx member and
  * hands it to destroy_rtentry().
  */
 static void
 destroy_rtentry_epoch(epoch_context_t ctx)
 {
 	struct rtentry *rt;
 
 	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
 
 	destroy_rtentry(rt);
 }
 
 /*
  * Schedule rtentry deletion.
  * Queues destroy_rtentry_epoch() via NET_EPOCH_CALL() using the epoch
  * context embedded in @rt; the entry is not freed by this call itself.
  * @rt must not be NULL (asserted).
  */
 void
 rt_free(struct rtentry *rt)
 {
 
 	KASSERT(rt != NULL, ("%s: NULL rt", __func__));
 
 	NET_EPOCH_CALL(destroy_rtentry_epoch, &rt->rt_epoch_ctx);
 }
 
 /*
  * Returns @rt to the rtentry UMA zone right away.  Unlike
  * destroy_rtentry(), this does not touch the nexthop stored in @rt.
  */
 void
 rt_free_immediate(struct rtentry *rt)
 {
 	uma_zfree(V_rtzone, rt);
 }
 
 /*
  * Tells whether @rt has the RTF_HOST flag set in its rte_flags.
  */
 bool
 rt_is_host(const struct rtentry *rt)
 {
 	return ((rt->rte_flags & RTF_HOST) != 0);
 }
 
 /*
  * Returns the address family recorded in the key (destination) sockaddr
  * of @rt.
  */
 sa_family_t
 rt_get_family(const struct rtentry *rt)
 {
 	const struct sockaddr *key = (const struct sockaddr *)rt_key_const(rt);
 
 	return (key->sa_family);
 }
 
 /*
  * Returns pointer to nexthop or nexthop group
  * associated with @rt.
  * The rt_nhop pointer is returned as stored, without any further
  * processing.
  */
 struct nhop_object *
 rt_get_raw_nhop(const struct rtentry *rt)
 {
 
 	return (rt->rt_nhop);
 }
 
 /*
  * Copies the nexthop pointer and weight of @rt into @rnd.
  */
 void
 rt_get_rnd(const struct rtentry *rt, struct route_nhop_data *rnd)
 {
 	rnd->rnd_nhop = rt->rt_nhop;
 	rnd->rnd_weight = rt->rt_weight;
 }
 
 #ifdef INET
 /*
  * Exports the IPv4 prefix of @rt: the address goes to @paddr and the
  * prefix length to @plen.  A route without a mask reports length 32;
  * otherwise the length is the number of bits set in the mask.
  * @pscopeid is always set to 0.
  */
 void
 rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
     int *plen, uint32_t *pscopeid)
 {
 	const struct sockaddr_in *key, *mask;
 
 	key = (const struct sockaddr_in *)rt_key_const(rt);
 	KASSERT((key->sin_family == AF_INET),
 	    ("rt family is %d, not inet", key->sin_family));
 	*paddr = key->sin_addr;
 
 	mask = (const struct sockaddr_in *)rt_mask_const(rt);
 	*plen = (mask != NULL) ? bitcount32(mask->sin_addr.s_addr) : 32;
 	*pscopeid = 0;
 }
 
 /*
  * Stores IPv4 address and prefix mask of @rt inside
  *  @paddr and @pmask. Sets mask to INADDR_BROADCAST (all ones) for
  *  routes without a mask (host routes).
  * @pscopeid is currently always set to 0.
  */
 void
 rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
     struct in_addr *pmask, uint32_t *pscopeid)
 {
 	const struct sockaddr_in *dst;
 
 	dst = (const struct sockaddr_in *)rt_key_const(rt);
 	KASSERT((dst->sin_family == AF_INET),
 	    ("rt family is %d, not inet", dst->sin_family));
 	*paddr = dst->sin_addr;
 	/* From here on 'dst' is reused to point at the route mask. */
 	dst = (const struct sockaddr_in *)rt_mask_const(rt);
 	if (dst == NULL)
 		pmask->s_addr = INADDR_BROADCAST;
 	else
 		*pmask = dst->sin_addr;
 	*pscopeid = 0;
 }
 #endif
 
 #ifdef INET6
 /*
  * Counts the bits set across the four 32-bit words of @addr and returns
  * the total as the prefix length.
  */
 static int
 inet6_get_plen(const struct in6_addr *addr)
 {
 	int plen = 0;
 
 	for (int i = 0; i < 4; i++)
 		plen += bitcount32(addr->s6_addr32[i]);
 	return (plen);
 }
 
 /*
  * Stores IPv6 address and prefix length of @rt inside
  *  @paddr and @plen. Addresses are returned in de-embedded form.
  * Scopeid is set to 0 for non-LL addresses; for link-local addresses
  *  it is filled in by in6_splitscope().
  * A route without a mask reports a prefix length of 128.
  */
 void
 rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr,
     int *plen, uint32_t *pscopeid)
 {
 	const struct sockaddr_in6 *dst;
 
 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
 	KASSERT((dst->sin6_family == AF_INET6),
 	    ("rt family is %d, not inet6", dst->sin6_family));
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
 	else {
 		*paddr = dst->sin6_addr;
 		/*
 		 * Honour the documented contract: do not leave the
 		 * caller's scope id untouched (possibly uninitialized).
 		 */
 		*pscopeid = 0;
 	}
 	/* From here on 'dst' is reused to point at the route mask. */
 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
 	if (dst == NULL)
 		*plen = 128;
 	else
 		*plen = inet6_get_plen(&dst->sin6_addr);
 }
 
 /*
  * Stores IPv6 address and prefix mask of @rt inside
  *  @paddr and @pmask. Addresses are returned in de-embedded form.
  * Scopeid is set to 0 for non-LL addresses; for link-local addresses
  *  it is filled in by in6_splitscope().
  * A route without a mask reports an all-ones mask.
  */
 void
 rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr,
     struct in6_addr *pmask, uint32_t *pscopeid)
 {
 	const struct sockaddr_in6 *dst;
 
 	dst = (const struct sockaddr_in6 *)rt_key_const(rt);
 	KASSERT((dst->sin6_family == AF_INET6),
 	    ("rt family is %d, not inet6", dst->sin6_family));
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
 		in6_splitscope(&dst->sin6_addr, paddr, pscopeid);
 	else {
 		*paddr = dst->sin6_addr;
 		/*
 		 * Honour the documented contract: do not leave the
 		 * caller's scope id untouched (possibly uninitialized).
 		 */
 		*pscopeid = 0;
 	}
 	/* From here on 'dst' is reused to point at the route mask. */
 	dst = (const struct sockaddr_in6 *)rt_mask_const(rt);
 	if (dst == NULL)
 		memset(pmask, 0xFF, sizeof(struct in6_addr));
 	else
 		*pmask = dst->sin6_addr;
 }
 #endif
 
 
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 4ae5c9559566..b77692d28588 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -1,2723 +1,2724 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_ddb.h"
 #include "opt_route.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 #include <net/route/nhop.h>
 
 #define	DEBUG_MOD_NAME	rtsock
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
 #include <net/route/route_debug.h>
 _DECLARE_DEBUG(LOG_INFO);
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 /*
  * Interface message header layout used when COMPAT_FREEBSD32 is enabled.
  * NOTE(review): presumably mirrors struct if_msghdr as seen by 32-bit
  * processes -- confirm against the native definition.
  */
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	struct	if_data ifm_data;
 };
 
 /*
  * Extended interface message header layout used when COMPAT_FREEBSD32 is
  * enabled; compared to if_msghdr32 it additionally carries ifm_len and
  * ifm_data_off fields ahead of the data.
  * NOTE(review): presumably mirrors struct if_msghdrl as seen by 32-bit
  * processes -- confirm against the native definition.
  */
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	uint32_t _ifm_spare2;
 	struct	if_data ifm_data;
 };
 
 /*
  * Extended interface address message header layout used when
  * COMPAT_FREEBSD32 is enabled.
  * NOTE(review): presumably mirrors struct ifa_msghdrl as seen by 32-bit
  * processes -- confirm against the native definition.
  */
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 
 /*
  * Size occupied by sockaddr @sa when padded to a multiple of sizeof(int):
  * sizeof(int) when sa_len is 0, otherwise sa_len rounded up to the next
  * multiple of sizeof(int).  Note that @sa is evaluated more than once.
  */
 #define SA_SIZE32(sa)						\
     (  (((struct sockaddr *)(sa))->sa_len == 0) ?		\
 	sizeof(int)		:				\
 	1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) )
 
 #endif /* COMPAT_FREEBSD32 */
 
 /*
  * Descriptor of a flat memory buffer that is consumed front to back:
  * 'offset' bytes out of 'size' are already in use.
  */
 struct linear_buffer {
 	char		*base;	/* Base allocated memory pointer */
 	uint32_t	offset;	/* Currently used offset */
 	uint32_t	size;	/* Total buffer size */
 };
 #define	SCRATCH_BUFFER_SIZE	1024
 
 /*
  * Logs through RT_LOG_<level>() with the message prefixed by "PID <n>: ",
  * where <n> is curproc's pid, or 0 when curproc is NULL.
  */
 #define	RTS_PID_LOG(_l, _fmt, ...)	RT_LOG_##_l(_l, "PID %d: " _fmt, curproc ? curproc->p_pid : 0, ## __VA_ARGS__)
 
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /* NB: these are not modified */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 /*
  * Used to store address family of the notification.
  */
 #define	m_rtsock_family	m_pkthdr.PH_loc.eight[0]
 
 /*
  * Routing socket control block: one per attached routing socket,
  * linked into V_route_cb.cblist.
  */
 struct rcb {
 	LIST_ENTRY(rcb) list;		/* linkage in V_route_cb.cblist */
 	struct socket	*rcb_socket;	/* owning socket */
 	sa_family_t	rcb_family;	/* protocol given at attach time */
 };
 
 /*
  * Per-vnet list of attached routing sockets together with attachment
  * counters; updated in rts_attach()/rts_detach() under RTSOCK_LOCK().
  */
 typedef struct {
 	LIST_HEAD(, rcb)	cblist;
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 VNET_DEFINE_STATIC(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 /*
  * Argument bundle handed to the dump/message-building helpers.
  * w_tmem/w_tmemsize describe the output buffer and its size (see
  * update_rtm_from_info()).
  * NOTE(review): the remaining fields look like sysctl dump parameters
  * (family filter, operation and its argument, request, dst/mask
  * scratch) -- confirm against their users.
  */
 struct walkarg {
 	int	family;
 	int	w_tmemsize;
 	int	w_op, w_arg;
 	caddr_t	w_tmem;
 	struct sysctl_req *w_req;
 	struct sockaddr *dst;
 	struct sockaddr *mask;
 };
 
 /* Forward declarations of file-local helpers. */
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb);
 static int	sysctl_dumpentry(struct rtentry *rt, void *vw);
 static int	sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh,
 			uint32_t weight, struct walkarg *w);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static void	rt_getmetrics(const struct rtentry *rt,
 			const struct nhop_object *nh, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static void	rt_ifannouncemsg(struct ifnet *ifp, int what);
 static int	handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
 			struct rt_msghdr *rtm, struct rib_cmd_info *rc);
 static int	update_rtm_from_rc(struct rt_addrinfo *info,
 			struct rt_msghdr **prtm, int alloc_len,
 			struct rib_cmd_info *rc, struct nhop_object *nh);
 static void	send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
 			struct mbuf *m, sa_family_t saf, u_int fibnum,
 			int rtm_errno);
 static bool	can_export_rte(struct ucred *td_ucred, bool rt_is_host,
 			const struct sockaddr *rt_dst);
 static void	rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc);
 static void	rtsock_ifmsg(struct ifnet *ifp, int if_flags_mask);
 
 /*
  * netisr handler descriptor for routing socket messages: packets queued
  * for NETISR_ROUTE are delivered to rts_input().
  */
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
         if (error || !req->newptr)
                 return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 /*
  * Per-vnet routing socket setup: registers the rtsock netisr handler.
  * In the default vnet the queue limit is first overridden from the
  * "net.route.netisr_maxqlen" tunable when that is set, and the handler
  * is registered with netisr_register().  With VIMAGE, any other vnet
  * registers through netisr_register_vnet() instead.
  */
 static void
 vnet_rts_init(void)
 {
 	int tmp;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
 			rtsock_nh.nh_qlimit = tmp;
 		netisr_register(&rtsock_nh);
 	}
 #ifdef VIMAGE
 	 else
 		netisr_register_vnet(&rtsock_nh);
 #endif
 }
 VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_init, 0);
 
 #ifdef VIMAGE
 /*
  * Per-vnet teardown (VIMAGE only): unregisters the rtsock netisr handler
  * from the current vnet.
  */
 static void
 vnet_rts_uninit(void)
 {
 
 	netisr_unregister_vnet(&rtsock_nh);
 }
 VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_uninit, 0);
 #endif
 
 static void
 report_route_event(const struct rib_cmd_info *rc, void *_cbdata)
 {
 	uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata;
 	struct nhop_object *nh;
 
 	nh = rc->rc_cmd == RTM_DELETE ? rc->rc_nh_old : rc->rc_nh_new;
 	rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum);
 }
 
 /*
  * Route change callback for routing sockets.
  * With ROUTE_MPATH, a change whose old or new nexthop is a nexthop group
  * is first split by rib_decompose_notification(), which calls
  * report_route_event() with the FIB number passed as callback data; any
  * other change is reported directly.
  */
 static void
 rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
 {
 #ifdef ROUTE_MPATH
 	if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) ||
 	    (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) {
 		rib_decompose_notification(rc, report_route_event,
 		    (void *)(uintptr_t)fibnum);
 	} else
 #endif
 		report_route_event(rc, (void *)(uintptr_t)fibnum);
 }
 /*
  * Callback table that rtsock_init() installs into rtsock_callback_p;
  * the value previously stored there is kept in rtsbridge_orig_p.
  */
 static struct rtbridge rtsbridge = {
 	.route_f = rts_handle_route_event,
 	.ifmsg_f = rtsock_ifmsg,
 };
 static struct rtbridge *rtsbridge_orig_p;
 
 /*
  * Forwards the route event @rc for @fibnum to the route_f callback of
  * netlink_callback_p.
  * NOTE(review): netlink_callback_p is dereferenced without a NULL check;
  * presumably it is always populated -- confirm.
  */
 static void
 rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
 {
 	netlink_callback_p->route_f(fibnum, rc);
 }
 
 /*
  * Boot-time hook: remembers the current rtsock_callback_p in
  * rtsbridge_orig_p and replaces it with the rtsock callback table.
  */
 static void
 rtsock_init(void)
 {
 	rtsbridge_orig_p = rtsock_callback_p;
 	rtsock_callback_p = &rtsbridge;
 }
 SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL);
 
 /*
  * ifnet_arrival_event handler: sends an IFAN_ARRIVAL announcement for
  * @ifp.
  */
 static void
 rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
 {
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 EVENTHANDLER_DEFINE(ifnet_arrival_event, rts_handle_ifnet_arrival, NULL, 0);
 
 /*
  * ifnet_departure_event handler: sends an IFAN_DEPARTURE announcement
  * for @ifp.
  */
 static void
 rts_handle_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 {
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 }
 EVENTHANDLER_DEFINE(ifnet_departure_event, rts_handle_ifnet_departure, NULL, 0);
 
 /*
  * Queues @m on the receive buffer of @so with route_src as the source
  * address.  On success the reader is woken up; if the append fails the
  * overflow is signalled on the socket and @m is freed.
  */
 static void
 rts_append_data(struct socket *so, struct mbuf *m)
 {
 
 	if (sbappendaddr(&so->so_rcv, &route_src, m, NULL) != 0) {
 		sorwakeup(so);
 		return;
 	}
 
 	/* The message could not be appended to the receive buffer. */
 	soroverflow(so);
 	m_freem(m);
 }
 
 /*
  * netisr handler: delivers routing message @m to every attached routing
  * socket that matches it, under RTSOCK_LOCK().
  *
  * A socket is skipped when it was attached for a specific family that
  * differs from the message's family, or when the message is flagged
  * RTS_FILTER_FIB and its FIB differs from the socket's so_fibnum.
  * Every matching socket except the last one receives a copy (silently
  * skipped if the copy cannot be allocated); the last one receives @m
  * itself.  If nothing matches, @m is freed.
  */
 static void
 rts_input(struct mbuf *m)
 {
 	struct rcb *rcb;
 	struct socket *last;
 
 	last = NULL;
 	RTSOCK_LOCK();
 	LIST_FOREACH(rcb, &V_route_cb.cblist, list) {
 		if (rcb->rcb_family != AF_UNSPEC &&
 		    rcb->rcb_family != m->m_rtsock_family)
 			continue;
 		if ((m->m_flags & RTS_FILTER_FIB) &&
 		    M_GETFIB(m) != rcb->rcb_socket->so_fibnum)
 			continue;
 		/*
 		 * Delivery lags one socket behind so that the original
 		 * mbuf can be handed to the final recipient without a copy.
 		 */
 		if (last != NULL) {
 			struct mbuf *n;
 
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (n != NULL)
 				rts_append_data(last, n);
 		}
 		last = rcb->rcb_socket;
 	}
 	if (last != NULL)
 		rts_append_data(last, m);
 	else
 		m_freem(m);
 	RTSOCK_UNLOCK();
 }
 
 /*
  * Close handler for routing sockets: marks @so as disconnected.
  */
 static void
 rts_close(struct socket *so)
 {
 
 	soisdisconnected(so);
 }
 
 static SYSCTL_NODE(_net, OID_AUTO, rtsock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Routing socket infrastructure");
 static u_long rts_sendspace = 8192;
 SYSCTL_ULONG(_net_rtsock, OID_AUTO, sendspace, CTLFLAG_RW, &rts_sendspace, 0,
     "Default routing socket send space");
 static u_long rts_recvspace = 8192;
 SYSCTL_ULONG(_net_rtsock, OID_AUTO, recvspace, CTLFLAG_RW, &rts_recvspace, 0,
     "Default routing socket receive space");
 
 /*
  * Attach handler for routing sockets.
  *
  * Reserves send/receive buffer space, allocates the control block for
  * @so, records @proto as the socket's address family filter, inherits
  * the FIB number from the creating process and links the control block
  * into the per-vnet list, updating the attachment counters.  The socket
  * is marked connected before returning.
  *
  * Returns 0 on success or the soreserve() error.
  */
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rcb *rcb;
 	int error;
 
 	error = soreserve(so, rts_sendspace, rts_recvspace);
 	if (error)
 		return (error);
 
 	/* M_WAITOK: the allocation may sleep, so no NULL check follows. */
 	rcb = malloc(sizeof(*rcb), M_PCB, M_WAITOK);
 	rcb->rcb_socket = so;
 	rcb->rcb_family = proto;
 
 	so->so_pcb = rcb;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	so->so_options |= SO_USELOOPBACK;
 
 	RTSOCK_LOCK();
 	LIST_INSERT_HEAD(&V_route_cb.cblist, rcb, list);
 	switch (proto) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 
 	return (0);
 }
 
 /*
  * Detach handler for routing sockets: the inverse of rts_attach().
  * Unlinks the socket's control block from the per-vnet list, decrements
  * the matching attachment counters under RTSOCK_LOCK(), then frees the
  * control block and clears so_pcb.
  */
 static void
 rts_detach(struct socket *so)
 {
 	struct rcb *rcb = so->so_pcb;
 
 	RTSOCK_LOCK();
 	LIST_REMOVE(rcb, list);
 	switch(rcb->rcb_family) {
 	case AF_INET:
 		V_route_cb.ip_count--;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count--;
 		break;
 	}
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 	free(rcb, M_PCB);
 	so->so_pcb = NULL;
 }
 
 /*
  * Disconnect handler for routing sockets: always fails with ENOTCONN.
  */
 static int
 rts_disconnect(struct socket *so)
 {
 
 	return (ENOTCONN);
 }
 
 /*
  * Shutdown handler for routing sockets: marks @so as unable to send any
  * more data and returns 0.
  */
 static int
 rts_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 	return (0);
 }
 
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle:
  * generic sockaddr, IPv4 and IPv6.
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 /*
  * Chooses the interface address (RTAX_IFA) reported to a jailed caller.
  *
  * If the nexthop's own interface address is accepted by prison_if() for
  * @cred, it is used as is.  Otherwise, depending on the family of the
  * RTAX_DST address, the addresses of @ifp are searched for one that
  * passes the jail check; failing that, the jail's default address is
  * requested via prison_get_ip4()/prison_get_ip6().  The chosen address
  * is written into @saun, and info->rti_info[RTAX_IFA] is pointed at it.
  *
  * Returns 0 on success, ESRCH if no suitable address exists or the
  * destination family is not handled.
  */
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred)
 {
 #if defined(INET) || defined(INET6)
 	struct epoch_tracker et;
 #endif
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		/* Build the reply sockaddr in caller-provided storage. */
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct in6_addr ia6;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET6)
 				continue;
 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
 			    &ia6, sizeof(struct in6_addr));
 			if (prison_check_ip6(cred, &ia6) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)->
 			    sin6_addr;
 			if (prison_get_ip6(cred, &ia6) != 0)
 				return (ESRCH);
 		}
 		/* Build the reply sockaddr in caller-provided storage. */
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
 		if (sa6_recoverscope(&saun->sin6) != 0)
 			return (ESRCH);
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 /*
  * Prepares @info for a blackhole/reject nexthop routed via the loopback
  * interface.
  *
  * Sets info->rti_ifp to V_loif, picks the first loopback interface
  * address whose family matches RTAX_DST as info->rti_ifa, stores the
  * family's loopback address in @saun, installs it as RTAX_GATEWAY and
  * adds RTF_GATEWAY to the flags.
  *
  * Returns 0 on success, ENOTSUP when there is no loopback interface, no
  * interface address is available, or the family is unsupported.
  *
  * NOTE(review): info->rti_ifa is not cleared before the search, so the
  * NULL check below presumably relies on the caller passing it in as
  * NULL -- confirm.
  */
 static int
 fill_blackholeinfo(struct rt_addrinfo *info, union sockaddr_union *saun)
 {
 	struct ifaddr *ifa;
 	sa_family_t saf;
 
 	if (V_loif == NULL) {
 		RTS_PID_LOG(LOG_INFO, "Unable to add blackhole/reject nhop without loopback");
 		return (ENOTSUP);
 	}
 	info->rti_ifp = V_loif;
 
 	saf = info->rti_info[RTAX_DST]->sa_family;
 
 	CK_STAILQ_FOREACH(ifa, &info->rti_ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == saf) {
 			info->rti_ifa = ifa;
 			break;
 		}
 	}
 	if (info->rti_ifa == NULL) {
 		RTS_PID_LOG(LOG_INFO, "Unable to find ifa for blackhole/reject nhop");
 		return (ENOTSUP);
 	}
 
 	bzero(saun, sizeof(union sockaddr_union));
 	switch (saf) {
 #ifdef INET
 	case AF_INET:
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		saun->sin6.sin6_family = AF_INET6;
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_addr = in6addr_loopback;
 		break;
 #endif
 	default:
 		RTS_PID_LOG(LOG_INFO, "unsupported family: %d", saf);
 		return (ENOTSUP);
 	}
 	info->rti_info[RTAX_GATEWAY] = &saun->sa;
 	info->rti_flags |= RTF_GATEWAY;
 
 	return (0);
 }
 
 /*
  * Fills in @info based on userland-provided @rtm message.
  *
  * Stamps @rtm with the calling process id, parses the sockaddrs that
  * follow the header (@len is the total message length), sanitizes them
  * via cleanup_xaddrs() using @lb as scratch space, and enforces
  * PRIV_NET_ROUTE for every message type other than RTM_GET.  @fibnum is
  * the FIB used for the gateway lookup described below.
  *
  * Returns 0 on success, EINVAL if the sockaddrs cannot be parsed, or the
  * error from cleanup_xaddrs()/priv_check().
  */
 static int
 fill_addrinfo(struct rt_msghdr *rtm, int len, struct linear_buffer *lb, u_int fibnum,
     struct rt_addrinfo *info)
 {
 	int error;
 
 	rtm->rtm_pid = curproc->p_pid;
 	info->rti_addrs = rtm->rtm_addrs;
 
 	info->rti_mflags = rtm->rtm_inits;
 	info->rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, info))
 		return (EINVAL);
 
 	info->rti_flags = rtm->rtm_flags;
 	error = cleanup_xaddrs(info, lb);
 	if (error != 0)
 		return (error);
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info->rti_info[RTAX_GATEWAY] != NULL &&
 	    info->rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct nhop_object *nh;
 
 		/*
 		 * A host route through the loopback interface is
 		 * installed for each interface address. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		nh = rib_lookup(fibnum, info->rti_info[RTAX_GATEWAY], NHR_NONE, 0);
 		if (nh != NULL && nh->gw_sa.sa_family == AF_LINK &&
 		    nh->nh_ifp->if_flags & IFF_LOOPBACK) {
 				info->rti_flags &= ~RTF_GATEWAY;
 				info->rti_flags |= RTF_GWFLAG_COMPAT;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Picks a single nexthop out of @nh.
  *
  * A plain nexthop is returned as is.  For a nexthop group (ROUTE_MPATH
  * only) the first member is returned when @gw is NULL, otherwise the
  * first member whose gateway matches @gw.
  *
  * Returns NULL when no group member matches @gw, or when @nh is a
  * nexthop group but ROUTE_MPATH is not compiled in.
  */
 static struct nhop_object *
 select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
 {
 	if (!NH_IS_NHGRP(nh))
 		return (nh);
 #ifdef ROUTE_MPATH
 	const struct weightened_nhop *wn;
 	uint32_t num_nhops;
 	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 	if (gw == NULL)
 		return (wn[0].nh);
 	/* Index type matches num_nhops to avoid a signed/unsigned compare. */
 	for (uint32_t i = 0; i < num_nhops; i++) {
 		if (match_nhop_gw(wn[i].nh, gw))
 			return (wn[i].nh);
 	}
 #endif
 	return (NULL);
 }
 
 /*
  * Handles RTM_GET message from routing socket, looking up the matching rt.
  *
  * The lookup runs under the rib read lock, which is always released
  * before returning.
  *
  * Returns:
  * 0 on success, with the matching rtentry stored in @rc->rc_rt, the
  *  selected nexthop in @rc->rc_nh_new and the route weight in
  *  @rc->rc_nh_weight
  * EAFNOSUPPORT if there is no rib for the destination family in @fibnum
  * ESRCH if no matching route, nexthop or local prefix is found
  */
 static int
 handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
     struct rt_msghdr *rtm, struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rnh;
 	struct nhop_object *nh;
 	sa_family_t saf;
 
 	saf = info->rti_info[RTAX_DST]->sa_family;
 
 	rnh = rt_tables_get_rnh(fibnum, saf);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	RIB_RLOCK(rnh);
 
 	/*
 	 * By (implicit) convention host route (one without netmask)
 	 * means longest-prefix-match request and the route with netmask
 	 * means exact-match lookup.
 	 * As cleanup_xaddrs() cleans up info flags&addrs for the /32,/128
 	 * prefixes, use original data to check for the netmask presence.
 	 */
 	if ((rtm->rtm_addrs & RTA_NETMASK) == 0) {
 		/*
 		 * Provide longest prefix match for
 		 * address lookup (no mask).
 		 * 'route -n get addr'
 		 */
 		rc->rc_rt = (struct rtentry *) rnh->rnh_matchaddr(
 		    info->rti_info[RTAX_DST], &rnh->head);
 	} else
 		rc->rc_rt = (struct rtentry *) rnh->rnh_lookup(
 		    info->rti_info[RTAX_DST],
 		    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rc->rc_rt == NULL) {
 		RIB_RUNLOCK(rnh);
 		return (ESRCH);
 	}
 
 	nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
 	if (nh == NULL) {
 		RIB_RUNLOCK(rnh);
 		return (ESRCH);
 	}
 	/*
 	 * If performing proxied L2 entry insertion, and
 	 * the actual PPP host entry is found, perform
 	 * another search to retrieve the prefix route of
 	 * the local end point of the PPP link.
 	 * TODO: move this logic to userland.
 	 */
 	if (rtm->rtm_flags & RTF_ANNOUNCE) {
 		struct sockaddr_storage laddr;
 
 		if (nh->nh_ifp != NULL &&
 		    nh->nh_ifp->if_type == IFT_PROPVIRTUAL) {
 			struct ifaddr *ifa;
 
 			ifa = ifa_ifwithnet(info->rti_info[RTAX_DST], 1,
 					RT_ALL_FIBS);
 			if (ifa == NULL) {
 				/*
 				 * No local prefix to search for: bail out
 				 * rather than look up an uninitialized
 				 * laddr below.
 				 */
 				RIB_RUNLOCK(rnh);
 				return (ESRCH);
 			}
 			rt_maskedcopy(ifa->ifa_addr,
 				      (struct sockaddr *)&laddr,
 				      ifa->ifa_netmask);
 		} else
 			rt_maskedcopy(nh->nh_ifa->ifa_addr,
 				      (struct sockaddr *)&laddr,
 				      nh->nh_ifa->ifa_netmask);
 		/*
 		 * refactor rt and no lock operation necessary
 		 */
 		rc->rc_rt = (struct rtentry *)rnh->rnh_matchaddr(
 		    (struct sockaddr *)&laddr, &rnh->head);
 		if (rc->rc_rt == NULL) {
 			RIB_RUNLOCK(rnh);
 			return (ESRCH);
 		}
 		nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
 		if (nh == NULL) {
 			RIB_RUNLOCK(rnh);
 			return (ESRCH);
 		}
 	}
 	rc->rc_nh_new = nh;
 	rc->rc_nh_weight = rc->rc_rt->rt_weight;
 	RIB_RUNLOCK(rnh);
 
 	return (0);
 }
 
 /*
  * Zeroes @dst and @mask and stamps both with the length and family of
  * an IPv4 or IPv6 sockaddr, according to @family.  The storage behind
  * both pointers must be large enough for the chosen sockaddr type.
  * Any other family leaves @dst and @mask untouched.
  */
 static void
 init_sockaddrs_family(int family, struct sockaddr *dst, struct sockaddr *mask)
 {
 #ifdef INET
 	if (family == AF_INET) {
 		struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
 		struct sockaddr_in *mask4 = (struct sockaddr_in *)mask;
 
 		bzero(dst4, sizeof(struct sockaddr_in));
 		bzero(mask4, sizeof(struct sockaddr_in));
 
 		dst4->sin_family = AF_INET;
 		dst4->sin_len = sizeof(struct sockaddr_in);
 		mask4->sin_family = AF_INET;
 		mask4->sin_len = sizeof(struct sockaddr_in);
 	}
 #endif
 #ifdef INET6
 	if (family == AF_INET6) {
 		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
 		struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask;
 
 		bzero(dst6, sizeof(struct sockaddr_in6));
 		bzero(mask6, sizeof(struct sockaddr_in6));
 
 		dst6->sin6_family = AF_INET6;
 		dst6->sin6_len = sizeof(struct sockaddr_in6);
 		mask6->sin6_family = AF_INET6;
 		mask6->sin6_len = sizeof(struct sockaddr_in6);
 	}
 #endif
 }
 
 /*
  * Copies the prefix address and mask of @rt into the pre-initialized
  * sockaddrs @dst and @mask.  The family already stored in @dst selects
  * the IPv4 or IPv6 accessor; for IPv6 the scope id obtained from the
  * accessor is stored in dst's sin6_scope_id.  Other families are left
  * untouched.
  */
 static void
 export_rtaddrs(const struct rtentry *rt, struct sockaddr *dst,
     struct sockaddr *mask)
 {
 #ifdef INET
 	if (dst->sa_family == AF_INET) {
 		struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
 		struct sockaddr_in *mask4 = (struct sockaddr_in *)mask;
 		uint32_t scopeid = 0;
 		rt_get_inet_prefix_pmask(rt, &dst4->sin_addr, &mask4->sin_addr,
 		    &scopeid);
 		return;
 	}
 #endif
 #ifdef INET6
 	if (dst->sa_family == AF_INET6) {
 		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
 		struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask;
 		uint32_t scopeid = 0;
 		rt_get_inet6_prefix_pmask(rt, &dst6->sin6_addr,
 		    &mask6->sin6_addr, &scopeid);
 		dst6->sin6_scope_id = scopeid;
 		return;
 	}
 #endif
 }
 
 static int
 update_rtm_from_info(struct rt_addrinfo *info, struct rt_msghdr **prtm,
     int alloc_len)
 {
 	struct rt_msghdr *rtm, *orig_rtm = NULL;
 	struct walkarg w;
 	int len;
 
 	rtm = *prtm;
 	/* Check if we need to realloc storage */
 	rtsock_msg_buffer(rtm->rtm_type, info, NULL, &len);
 	if (len > alloc_len) {
 		struct rt_msghdr *tmp_rtm;
 
 		tmp_rtm = malloc(len, M_TEMP, M_NOWAIT);
 		if (tmp_rtm == NULL)
 			return (ENOBUFS);
 		bcopy(rtm, tmp_rtm, rtm->rtm_msglen);
 		orig_rtm = rtm;
 		rtm = tmp_rtm;
 		alloc_len = len;
 
 		/*
 		 * Delay freeing original rtm as info contains
 		 * data referencing it.
 		 */
 	}
 
 	w.w_tmem = (caddr_t)rtm;
 	w.w_tmemsize = alloc_len;
 	rtsock_msg_buffer(rtm->rtm_type, info, &w, &len);
 	rtm->rtm_addrs = info->rti_addrs;
 
 	if (orig_rtm != NULL)
 		free(orig_rtm, M_TEMP);
 	*prtm = rtm;
 	return (0);
 }
 
 
 /*
  * Update sockaddrs, flags, etc in @prtm based on @rc data.
  * rtm can be reallocated.
  *
  * Returns 0 on success, along with pointer to (potentially reallocated)
  *  rtm.
  *
  */
 static int
 update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm,
     int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh)
 {
 	union sockaddr_union saun;
 	struct rt_msghdr *rtm;
 	struct ifnet *ifp;
 	int error;
 
 	rtm = *prtm;
 	union sockaddr_union sa_dst, sa_mask;
 	int family = info->rti_info[RTAX_DST]->sa_family;
 	init_sockaddrs_family(family, &sa_dst.sa, &sa_mask.sa);
 	export_rtaddrs(rc->rc_rt, &sa_dst.sa, &sa_mask.sa);
 
 	info->rti_info[RTAX_DST] = &sa_dst.sa;
 	info->rti_info[RTAX_NETMASK] = rt_is_host(rc->rc_rt) ? NULL : &sa_mask.sa;
 	info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info->rti_info[RTAX_GENMASK] = 0;
 	ifp = nh->nh_ifp;
 	if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 		if (ifp) {
 			info->rti_info[RTAX_IFP] =
 			    ifp->if_addr->ifa_addr;
 			error = rtm_get_jailed(info, ifp, nh,
 			    &saun, curthread->td_ucred);
 			if (error != 0)
 				return (error);
 			if (ifp->if_flags & IFF_POINTOPOINT)
 				info->rti_info[RTAX_BRD] =
 				    nh->nh_ifa->ifa_dstaddr;
 			rtm->rtm_index = ifp->if_index;
 		} else {
 			info->rti_info[RTAX_IFP] = NULL;
 			info->rti_info[RTAX_IFA] = NULL;
 		}
 	} else if (ifp != NULL)
 		rtm->rtm_index = ifp->if_index;
 
 	if ((error = update_rtm_from_info(info, prtm, alloc_len)) != 0)
 		return (error);
 
 	rtm = *prtm;
 	rtm->rtm_flags = rc->rc_rt->rte_flags | nhop_get_rtflags(nh);
 	if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
 		rtm->rtm_flags = RTF_GATEWAY | 
 			(rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
 	rt_getmetrics(rc->rc_rt, nh, &rtm->rtm_rmx);
 	rtm->rtm_rmx.rmx_weight = rc->rc_nh_weight;
 
 	return (0);
 }
 
 #ifdef ROUTE_MPATH
 /*
  * Notification callback: copies @rc into the rib_cmd_info passed as
  * @_cbdata when it describes an RTM_DELETE; other commands are ignored.
  */
 static void
 save_del_notification(const struct rib_cmd_info *rc, void *_cbdata)
 {
 	struct rib_cmd_info *saved = _cbdata;
 
 	if (rc->rc_cmd != RTM_DELETE)
 		return;
 	*saved = *rc;
 }
 
 /*
  * Notification callback: copies @rc into the rib_cmd_info passed as
  * @_cbdata when it describes an RTM_ADD; other commands are ignored.
  */
 static void
 save_add_notification(const struct rib_cmd_info *rc, void *_cbdata)
 {
 	struct rib_cmd_info *saved = _cbdata;
 
 	if (rc->rc_cmd != RTM_ADD)
 		return;
 	*saved = *rc;
 }
 #endif
 
 #if defined(INET6) || defined(INET)
 /*
  * Carve @len bytes, rounded up to a multiple of sizeof(uint64_t), out of
  * the linear buffer @lb.
  *
  * Returns a pointer to the carved region, or NULL when the rounded
  * request does not fit into the space left in @lb (in which case @lb is
  * not modified).
  */
 static struct sockaddr *
 alloc_sockaddr_aligned(struct linear_buffer *lb, int len)
 {
 	struct sockaddr *chunk;
 	int aligned_len;
 
 	aligned_len = roundup2(len, sizeof(uint64_t));
 	if (lb->offset + aligned_len > lb->size)
 		return (NULL);
 
 	chunk = (struct sockaddr *)(lb->base + lb->offset);
 	lb->offset += aligned_len;
 	return (chunk);
 }
 #endif
 
 /*
  * Process one routing message written to the routing socket @so.
  *
  * The request in @m is copied into a temporary buffer, validated,
  * executed and then handed (together with @m) to send_rtm_reply(),
  * which reports the result and releases both.
  *
  * - Out-of-band sends and control data are rejected with EOPNOTSUPP.
  * - A message shorter than rt_msghdr, or whose rtm_msglen disagrees
  *   with the packet length, yields EINVAL.
  * - A message with an unexpected rtm_version yields EPROTONOSUPPORT and
  *   is echoed without being modified.
  * - Requests carrying RTF_LLDATA are delegated to lla_rt_output().
  * - RTM_ADD, RTM_CHANGE, RTM_DELETE and RTM_GET are handled here; any
  *   other type yields EOPNOTSUPP.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rt_addrinfo info;
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 	int i, rti_need_deembed = 0;
 #endif
 	int alloc_len = 0, len, error = 0, fibnum;
 	sa_family_t saf = AF_UNSPEC;
 	struct rib_cmd_info rc;
 	struct nhop_object *nh;
 
 	if ((flags & PRUS_OOB) || control != NULL) {
 		m_freem(m);
 		if (control != NULL)
 			m_freem(control);
 		return (EOPNOTSUPP);
 	}
 
 	fibnum = so->so_fibnum;
 	/* Record the error and jump to the common reply/cleanup path. */
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	NET_EPOCH_ENTER(et);
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundary.
 	 */
 	alloc_len = roundup2(len, 1024);
 	/* The scratch area for rewritten sockaddrs follows the message. */
 	int total_len = alloc_len + SCRATCH_BUFFER_SIZE;
 	if ((rtm = malloc(total_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	nh = NULL;
 	struct linear_buffer lb = {
 		.base = (char *)rtm + alloc_len,
 		.size = SCRATCH_BUFFER_SIZE,
 	};
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	if ((error = fill_addrinfo(rtm, len, &lb, fibnum, &info)) != 0) {
 		senderr(error);
 	}
 	/* fill_addrinfo() embeds scope into IPv6 addresses */
 #ifdef INET6
 	rti_need_deembed = 1;
 #endif
 
 	saf = info.rti_info[RTAX_DST]->sa_family;
 
 	/* support for new ARP code */
 	if (rtm->rtm_flags & RTF_LLDATA) {
 		error = lla_rt_output(rtm, &info);
 		goto flush;
 	}
 
 	union sockaddr_union gw_saun;
 	int blackhole_flags = rtm->rtm_flags & (RTF_BLACKHOLE|RTF_REJECT);
 	if (blackhole_flags != 0) {
 		/* RTF_BLACKHOLE and RTF_REJECT are mutually exclusive. */
 		if (blackhole_flags != (RTF_BLACKHOLE | RTF_REJECT))
 			error = fill_blackholeinfo(&info, &gw_saun);
 		else {
 			RTS_PID_LOG(LOG_DEBUG, "both BLACKHOLE and REJECT flags specifiied");
 			error = EINVAL;
 		}
 		if (error != 0)
 			senderr(error);
 	}
 
 	switch (rtm->rtm_type) {
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (rtm->rtm_type == RTM_ADD) {
 			if (info.rti_info[RTAX_GATEWAY] == NULL) {
 				RTS_PID_LOG(LOG_DEBUG, "RTM_ADD w/o gateway");
 				senderr(EINVAL);
 			}
 		}
 		error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
 		if (error == 0) {
 			rtsock_notify_event(fibnum, &rc);
 #ifdef ROUTE_MPATH
 			/*
 			 * A nexthop group is involved: reduce the result
 			 * to the RTM_ADD notification saved by the callback.
 			 */
 			if (NH_IS_NHGRP(rc.rc_nh_new) ||
 			    (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
 				struct rib_cmd_info rc_simple = {};
 				rib_decompose_notification(&rc,
 				    save_add_notification, (void *)&rc_simple);
 				rc = rc_simple;
 			}
 #endif
 			/* nh MAY be empty if RTM_CHANGE request is no-op */
 			nh = rc.rc_nh_new;
 			if (nh != NULL) {
 				rtm->rtm_index = nh->nh_ifp->if_index;
 				rtm->rtm_flags = rc.rc_rt->rte_flags | nhop_get_rtflags(nh);
 			}
 		}
 		break;
 
 	case RTM_DELETE:
 		error = rib_action(fibnum, RTM_DELETE, &info, &rc);
 		if (error == 0) {
 			rtsock_notify_event(fibnum, &rc);
 #ifdef ROUTE_MPATH
 			/*
 			 * A nexthop group is involved: reduce the result to
 			 * the RTM_DELETE notification saved by the callback.
 			 */
 			if (NH_IS_NHGRP(rc.rc_nh_old) ||
 			    (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
 				struct rib_cmd_info rc_simple = {};
 				rib_decompose_notification(&rc,
 				    save_del_notification, (void *)&rc_simple);
 				rc = rc_simple;
 			}
 #endif
 			nh = rc.rc_nh_old;
 		}
 		break;
 
 	case RTM_GET:
 		error = handle_rtm_get(&info, fibnum, rtm, &rc);
 		if (error != 0)
 			senderr(error);
 		nh = rc.rc_nh_new;
 
 		/* Report ESRCH for routes the caller may not see. */
 		if (!can_export_rte(curthread->td_ucred,
 		    info.rti_info[RTAX_NETMASK] == NULL,
 		    info.rti_info[RTAX_DST])) {
 			senderr(ESRCH);
 		}
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 	if (error == 0 && nh != NULL) {
 		error = update_rtm_from_rc(&info, &rtm, alloc_len, &rc, nh);
 		/*
 		 * Note that some sockaddr pointers may have changed to
 		 * point to memory outside @rtm. Some may be pointing
 		 * to the on-stack variables.
 		 * Given that, any pointer in @info CANNOT BE USED.
 		 */
 
 		/*
 		 * scopeid deembedding has been performed while
 		 * writing updated rtm in rtsock_msg_buffer().
 		 * With that in mind, skip deembedding procedure below.
 		 */
 #ifdef INET6
 		rti_need_deembed = 0;
 #endif
 	}
 
 flush:
 	NET_EPOCH_EXIT(et);
 
 #ifdef INET6
 	if (rtm != NULL) {
 		if (rti_need_deembed) {
 			/* sin6_scope_id is recovered before sending rtm. */
 			sin6 = (struct sockaddr_in6 *)&ss;
 			for (i = 0; i < RTAX_MAX; i++) {
 				if (info.rti_info[i] == NULL)
 					continue;
 				if (info.rti_info[i]->sa_family != AF_INET6)
 					continue;
 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					bcopy(sin6, info.rti_info[i],
 						    sizeof(*sin6));
 			}
 			/*
 			 * NOTE(review): ENOBUFS is recorded only when an
 			 * error is already pending, which replaces that
 			 * error and ignores the failure otherwise; this
 			 * looks like it was meant to be "error == 0" --
 			 * confirm the intent.
 			 */
 			if (update_rtm_from_info(&info, &rtm, alloc_len) != 0) {
 				if (error != 0)
 					error = ENOBUFS;
 			}
 		}
 	}
 #endif
 	/* Consumes both @rtm (may be NULL) and @m. */
 	send_rtm_reply(so, rtm, m, saf, fibnum, error);
 
 	return (error);
 }
 
 /*
  * Sends the prepared reply message in @rtm to all rtsock clients.
  * Frees @m and @rtm.
  *
  * When @rtm is not NULL its contents (with @rtm_errno recorded, or
  * RTF_DONE set on success) are copied into @m, which is then trimmed to
  * rtm_msglen and dispatched for family @saf tagged with @fibnum.
  * If the sender did not set SO_USELOOPBACK the message is not delivered
  * back to it, and nothing is sent at all when it is the only listener.
  */
 static void
 send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m,
     sa_family_t saf, u_int fibnum, int rtm_errno)
 {
 	struct rcb *self = NULL;
 	sa_family_t saved_family;
 
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		/* The sender does not want its own messages back. */
 		if (V_route_cb.any_count <= 1) {
 			/* ... and nobody else is listening. */
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return;
 		}
 		/* There is another listener, so construct message */
 		self = so->so_pcb;
 	}
 
 	if (rtm != NULL) {
 		if (rtm_errno == 0)
 			rtm->rtm_flags |= RTF_DONE;
 		else
 			rtm->rtm_errno = rtm_errno;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			/* The chain could not hold the whole message. */
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen) {
 			/* Negative length: trim from the tail. */
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 		}
 
 		free(rtm, M_TEMP);
 	}
 
 	if (m == NULL)
 		return;
 
 	M_SETFIB(m, fibnum);
 	m->m_flags |= RTS_FILTER_FIB;
 	if (self == NULL) {
 		rt_dispatch(m, saf);
 		return;
 	}
 
 	/*
 	 * XXX insure we don't get a copy by
 	 * invalidating our protocol
 	 */
 	saved_family = self->rcb_family;
 	self->rcb_family = AF_UNSPEC;
 	rt_dispatch(m, saf);
 	self->rcb_family = saved_family;
 }
 
 /*
  * Fill @out with the metrics of route @rt / nexthop @nh: MTU, weight,
  * nexthop index and expiration time.  All other fields are zeroed.
  * A zero nexthop expiration is reported as 0.
  */
 static void
 rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh,
     struct rt_metrics *out)
 {
 
 	memset(out, 0, sizeof(*out));
 	out->rmx_nhidx = nhop_get_idx(nh);
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_mtu = nh->nh_mtu;
 	if (nhop_get_expire(nh) != 0) {
 		/* Kernel -> userland timebase conversion. */
 		out->rmx_expire = nhop_get_expire(nh) - time_uptime +
 		    time_second;
 	} else {
 		out->rmx_expire = 0;
 	}
 }
 
 /*
  * Walk the sockaddrs packed between @cp and @cplim and store a pointer
  * to each one in @rtinfo->rti_info[], in the slots selected by the
  * @rtinfo->rti_addrs bitmask.
  *
  * The data comes straight from userland, so every sockaddr is checked
  * to fit before @cplim.  AF_INET6 addresses get their scope embedded
  * in place.
  *
  * Returns 0 on success or EINVAL if a sockaddr overruns the buffer.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	int idx;
 
 	for (idx = 0; idx < RTAX_MAX; idx++) {
 		struct sockaddr *cur;
 
 		if (cp >= cplim)
 			break;
 		if ((rtinfo->rti_addrs & (1 << idx)) == 0)
 			continue;
 
 		cur = (struct sockaddr *)cp;
 		/* Reject a sockaddr that claims to extend past the buffer. */
 		if (cp + cur->sa_len > cplim) {
 			RTS_PID_LOG(LOG_DEBUG, "sa_len too big for sa type %d", idx);
 			return (EINVAL);
 		}
 
 		/*
 		 * A zero-length sockaddr terminates parsing.  Remaining
 		 * bits in rti_addrs are bogus, but route(1) has been seen
 		 * generating such messages, so for compatibility point
 		 * the slot at a safe address instead of failing.
 		 */
 		if (cur->sa_len == 0) {
 			rtinfo->rti_info[idx] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 
 #ifdef INET6
 		if (cur->sa_family == AF_INET6)
 			sa6_embedscope((struct sockaddr_in6 *)cur,
 			    V_ip6_use_defzone);
 #endif
 		rtinfo->rti_info[idx] = cur;
 		cp += SA_SIZE(cur);
 	}
 	return (0);
 }
 
 #ifdef INET
 /*
  * Overwrite @sin with a clean AF_INET sockaddr holding @addr; every
  * other field (port, padding) is zeroed.
  */
 static inline void
 fill_sockaddr_inet(struct sockaddr_in *sin, struct in_addr addr)
 {
 
 	memset(sin, 0, sizeof(*sin));
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_family = AF_INET;
 	sin->sin_addr = addr;
 }
 #endif
 
 #ifdef INET6
 /*
  * Overwrite @sin6 with a clean AF_INET6 sockaddr holding *@addr6 and
  * @scopeid; every other field is zeroed.
  */
 static inline void
 fill_sockaddr_inet6(struct sockaddr_in6 *sin6, const struct in6_addr *addr6,
     uint32_t scopeid)
 {
 	/* @addr6 may point into @sin6 itself: copy it before clearing. */
 	const struct in6_addr addr = *addr6;
 
 	memset(sin6, 0, sizeof(*sin6));
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_addr = addr;
 	sin6->sin6_scope_id = scopeid;
 }
 #endif
 
 #if defined(INET6) || defined(INET)
 /*
  * Checks if gateway is suitable for lltable operations.
  * Lltable code requires AF_LINK gateway with ifindex
  *  and mac address specified.
  * Returns 0 on success.
  */
 static int
 cleanup_xaddrs_lladdr(struct rt_addrinfo *info)
 {
 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
 
 	if (sdl->sdl_family != AF_LINK)
 		return (EINVAL);
 
 	if (sdl->sdl_index == 0) {
 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gateway w/o ifindex");
 		return (EINVAL);
 	}
 
 	if (offsetof(struct sockaddr_dl, sdl_data) + sdl->sdl_nlen + sdl->sdl_alen > sdl->sdl_len) {
 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gw: sdl_nlen/sdl_alen too large");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Validates the RTAX_GATEWAY sockaddr in @info and replaces it with a
  * clean copy.
  *
  * With RTF_LLDATA set the check is delegated to cleanup_xaddrs_lladdr().
  * AF_INET and AF_LINK gateways are rebuilt in storage allocated from
  * @lb (for AF_LINK only the interface index is carried over); AF_INET6
  * gateways are rewritten in place.  Gateways of any other family are
  * left untouched.
  *
  * The caller must ensure RTAX_GATEWAY is set.
  *
  * Returns 0 on success, EINVAL if the sockaddr is too short or ENOBUFS
  * if @lb has no room left.
  */
 static int
 cleanup_xaddrs_gateway(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
 	struct sockaddr *sa;
 
 	if (info->rti_flags & RTF_LLDATA)
 		return (cleanup_xaddrs_lladdr(info));
 
 	switch (gw->sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *gw_sin = (struct sockaddr_in *)gw;
 
 			/* Ensure reads do not go beyond SA boundary */
 			if (SA_SIZE(gw) < offsetof(struct sockaddr_in, sin_zero)) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sin_len too small: %d",
 				    gw->sa_len);
 				return (EINVAL);
 			}
 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_in));
 			if (sa == NULL)
 				return (ENOBUFS);
 			fill_sockaddr_inet((struct sockaddr_in *)sa, gw_sin->sin_addr);
 			info->rti_info[RTAX_GATEWAY] = sa;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *gw_sin6 = (struct sockaddr_in6 *)gw;
 			if (gw_sin6->sin6_len < sizeof(struct sockaddr_in6)) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sin6_len too small: %d",
 				    gw->sa_len);
 				return (EINVAL);
 			}
 			/* Rewrite in place, keeping only the address (scope id 0). */
 			fill_sockaddr_inet6(gw_sin6, &gw_sin6->sin6_addr, 0);
 			break;
 		}
 #endif
 	case AF_LINK:
 		{
 			struct sockaddr_dl *gw_sdl;
 
 			size_t sdl_min_len = offsetof(struct sockaddr_dl, sdl_data);
 			gw_sdl = (struct sockaddr_dl *)gw;
 			if (gw_sdl->sdl_len < sdl_min_len) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sdl_len too small: %d",
 				    gw_sdl->sdl_len);
 				return (EINVAL);
 			}
 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_dl_short));
 			if (sa == NULL)
 				return (ENOBUFS);
 
 			/* Only the interface index survives in the clean copy. */
 			const struct sockaddr_dl_short sdl = {
 				.sdl_family = AF_LINK,
 				.sdl_len = sizeof(struct sockaddr_dl_short),
 				.sdl_index = gw_sdl->sdl_index,
 			};
 			*((struct sockaddr_dl_short *)sa) = sdl;
 			info->rti_info[RTAX_GATEWAY] = sa;
 			break;
 		}
 	}
 
 	return (0);
 }
 #endif
 
 /*
  * Turn @info into a host-route request: drop the netmask pointer, clear
  * RTA_NETMASK from the address bitmask and set RTF_HOST.
  */
 static void
 remove_netmask(struct rt_addrinfo *info)
 {
 	info->rti_addrs &= ~RTA_NETMASK;
 	info->rti_info[RTAX_NETMASK] = NULL;
 	info->rti_flags |= RTF_HOST;
 }
 
 #ifdef INET
 /*
  * Validates and normalises the AF_INET destination/netmask pair in @info.
  *
  * The destination is masked with the netmask (a missing netmask counts
  * as all-ones) and both are replaced by clean sockaddr_in copies
  * allocated from @lb.  An all-ones mask turns the request into a host
  * route via remove_netmask(); otherwise RTF_HOST is cleared.
  * A gateway, if present, is then checked by cleanup_xaddrs_gateway().
  *
  * The caller must ensure RTAX_DST is set.
  *
  * Returns 0 on success, EINVAL for a malformed sockaddr or ENOBUFS if
  * @lb has no room left.
  */
 static int
 cleanup_xaddrs_inet(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr_in *dst_sa, *mask_sa;
 	const int sa_len = sizeof(struct sockaddr_in);
 	struct in_addr dst, mask;
 
 	/* Check & fixup dst/netmask combination first */
 	dst_sa = (struct sockaddr_in *)info->rti_info[RTAX_DST];
 	mask_sa = (struct sockaddr_in *)info->rti_info[RTAX_NETMASK];
 
 	/* Ensure reads do not go beyond the buffer size */
 	if (SA_SIZE(dst_sa) < offsetof(struct sockaddr_in, sin_zero)) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin_len too small: %d",
 		    dst_sa->sin_len);
 		return (EINVAL);
 	}
 
 	if ((mask_sa != NULL) && mask_sa->sin_len < sizeof(struct sockaddr_in)) {
 		/*
 		 * Some older routing software encode mask length into the
 		 * sin_len, thus resulting in "truncated" sockaddr.
 		 */
 		int len = mask_sa->sin_len - offsetof(struct sockaddr_in, sin_addr);
 		if (len >= 0) {
 			/* Copy the bytes that are present; the rest stay zero. */
 			mask.s_addr = 0;
 			if (len > sizeof(struct in_addr))
 				len = sizeof(struct in_addr);
 			memcpy(&mask, &mask_sa->sin_addr, len);
 		} else {
 			RTS_PID_LOG(LOG_DEBUG, "prefix mask sin_len too small: %d",
 			    mask_sa->sin_len);
 			return (EINVAL);
 		}
 	} else
 		mask.s_addr = mask_sa ? mask_sa->sin_addr.s_addr : INADDR_BROADCAST;
 
 	/* Clear the destination bits not covered by the mask. */
 	dst.s_addr = htonl(ntohl(dst_sa->sin_addr.s_addr) & ntohl(mask.s_addr));
 
 	/* Construct new "clean" dst/mask sockaddresses */
 	if ((dst_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 		return (ENOBUFS);
 	fill_sockaddr_inet(dst_sa, dst);
 	info->rti_info[RTAX_DST] = (struct sockaddr *)dst_sa;
 
 	if (mask.s_addr != INADDR_BROADCAST) {
 		if ((mask_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 			return (ENOBUFS);
 		fill_sockaddr_inet(mask_sa, mask);
 		info->rti_info[RTAX_NETMASK] = (struct sockaddr *)mask_sa;
 		info->rti_flags &= ~RTF_HOST;
 	} else
 		remove_netmask(info);
 
 	/* Check gateway */
 	if (info->rti_info[RTAX_GATEWAY] != NULL)
 		return (cleanup_xaddrs_gateway(info, lb));
 
 	return (0);
 }
 #endif
 
 #ifdef INET6
 /*
  * Validates and normalises the AF_INET6 destination/netmask pair in @info.
  *
  * The destination address is masked in place with the netmask (a missing
  * netmask counts as in6mask128) and both are then replaced by clean
  * sockaddr_in6 copies allocated from @lb.  A mask equal to in6mask128
  * turns the request into a host route via remove_netmask(); otherwise
  * RTF_HOST is cleared.
  * A gateway, if present, is then checked by cleanup_xaddrs_gateway().
  *
  * The caller must ensure RTAX_DST is set.
  *
  * Returns 0 on success, EINVAL for a malformed sockaddr or ENOBUFS if
  * @lb has no room left.
  */
 static int
 cleanup_xaddrs_inet6(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr *sa;
 	struct sockaddr_in6 *dst_sa, *mask_sa;
 	struct in6_addr mask, *dst;
 	const int sa_len = sizeof(struct sockaddr_in6);
 
 	/* Check & fixup dst/netmask combination first */
 	dst_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_DST];
 	mask_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_NETMASK];
 
 	if (dst_sa->sin6_len < sizeof(struct sockaddr_in6)) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin6_len too small: %d",
 		    dst_sa->sin6_len);
 		return (EINVAL);
 	}
 
 	if (mask_sa && mask_sa->sin6_len < sizeof(struct sockaddr_in6)) {
 		/*
 		 * Some older routing software encode mask length into the
 		 * sin6_len, thus resulting in "truncated" sockaddr.
 		 */
 		int len = mask_sa->sin6_len - offsetof(struct sockaddr_in6, sin6_addr);
 		if (len >= 0) {
 			/* Copy the bytes that are present; the rest stay zero. */
 			bzero(&mask, sizeof(mask));
 			if (len > sizeof(struct in6_addr))
 				len = sizeof(struct in6_addr);
 			memcpy(&mask, &mask_sa->sin6_addr, len);
 		} else {
 			RTS_PID_LOG(LOG_DEBUG, "rtsock: prefix mask sin6_len too small: %d",
 			    mask_sa->sin6_len);
 			return (EINVAL);
 		}
 	} else
 		mask = mask_sa ? mask_sa->sin6_addr : in6mask128;
 
 	/* Mask the destination in place before copying it out. */
 	dst = &dst_sa->sin6_addr;
 	IN6_MASK_ADDR(dst, &mask);
 
 	if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 		return (ENOBUFS);
 	fill_sockaddr_inet6((struct sockaddr_in6 *)sa, dst, 0);
 	info->rti_info[RTAX_DST] = sa;
 
 	if (!IN6_ARE_ADDR_EQUAL(&mask, &in6mask128)) {
 		if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 			return (ENOBUFS);
 		fill_sockaddr_inet6((struct sockaddr_in6 *)sa, &mask, 0);
 		info->rti_info[RTAX_NETMASK] = sa;
 		info->rti_flags &= ~RTF_HOST;
 	} else
 		remove_netmask(info);
 
 	/* Check gateway */
 	if (info->rti_info[RTAX_GATEWAY] != NULL)
 		return (cleanup_xaddrs_gateway(info, lb));
 
 	return (0);
 }
 #endif
 
 /*
  * Validate and normalise the sockaddrs in @info, dispatching on the
  * family of RTAX_DST.  Replacement sockaddrs are allocated from @lb.
  *
  * Returns 0 on success, EINVAL when RTAX_DST is missing, EAFNOSUPPORT
  * for a destination family that is not handled, or the error reported
  * by the per-family routine.
  */
 static int
 cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	if (info->rti_info[RTAX_DST] == NULL) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst is not set");
 		return (EINVAL);
 	}
 
 	/*
 	 * arp(8)/ndp(8) sends RTA_NETMASK for the associated
 	 * prefix along with the actual address in RTA_DST.
 	 * Remove netmask to avoid unnecessary address masking.
 	 */
 	if ((info->rti_flags & RTF_LLDATA) != 0)
 		remove_netmask(info);
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 		return (cleanup_xaddrs_inet(info, lb));
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return (cleanup_xaddrs_inet6(info, lb));
 #endif
 	default:
 		return (EAFNOSUPPORT);
 	}
 }
 
 /*
  * Build in @dmask a netmask sockaddr with the length and family of
  * @dst, leaving the original @smask intact.  Mostly used with radix
  * netmasks.
  *
  * The first @dst->sa_len bytes of @dmask are cleared, @smask->sa_len
  * bytes of @smask are copied in, then length and family are overwritten
  * with those of @dst.
  *
  * Returns @dmask cast to a sockaddr, or NULL if @dst or @smask is NULL.
  */
 struct sockaddr *
 rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 	if (dst != NULL && smask != NULL) {
 		bzero(dmask, dst->sa_len);
 		bcopy(smask, dmask, smask->sa_len);
 		dmask->ss_family = dst->sa_family;
 		dmask->ss_len = dst->sa_len;
 		return ((struct sockaddr *)dmask);
 	}
 
 	return (NULL);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of various events (if/ifa announces, etc.)
  *
  * The header size is chosen from @type; every non-NULL sockaddr in
  * @rtinfo->rti_info[] is appended after it (IPv6 scope recovered on a
  * local copy) and the matching bit is set in @rtinfo->rti_addrs.
  * Only rtm_msglen, rtm_version and rtm_type are filled in the header;
  * the remaining header fields are left zeroed for the caller.
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr_storage ss;
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	int i;
 	struct sockaddr *sa;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	int len, dlen;
 
 	/* Pick the fixed header size for this message type. */
 	switch (type) {
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 
 		dlen = SA_SIZE(sa);
 		KASSERT(dlen <= sizeof(ss),
 		    ("%s: sockaddr size overflow", __func__));
 		/* Work on a zero-padded copy so the caller's data stays intact. */
 		bzero(&ss, sizeof(ss));
 		bcopy(sa, &ss, sa->sa_len);
 		sa = (struct sockaddr *)&ss;
 #ifdef INET6
 		if (sa->sa_family == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)sa;
 			(void)sa6_recoverscope(sin6);
 		}
 #endif
 		m_copyback(m, len, dlen, (caddr_t)sa);
 		len += dlen;
 	}
 	/* A length mismatch means some of the data could not be appended. */
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * The buffer is taken from @w->w_tmem / @w->w_tmemsize.  The header
  * size depends on @type and, for some types, on @w->w_op and on the
  * 32-bit compat flag of @w->w_req.  @rtinfo->rti_addrs is rebuilt from
  * the non-NULL entries of @rtinfo->rti_info[].  Only rtm_version,
  * rtm_type and rtm_msglen are written to the header, and only when
  * everything fits.
  *
  * Returns 0 on success, ENOBUFS if @w was supplied but its buffer is
  * too small (@plen still holds the needed size).
  *
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	struct sockaddr_storage ss;
 	int len, buflen = 0, dlen, i;
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 #ifdef COMPAT_FREEBSD32
 	bool compat32 = false;
 #endif
 
 	/* Pick the fixed header size for this message type. */
 	switch (type) {
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32) {
 				len = sizeof(struct ifa_msghdrl32);
 				compat32 = true;
 			} else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			compat32 = true;
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* cp stays NULL in size-only mode (no @w). */
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		buflen = w->w_tmemsize - len;
 		cp = (caddr_t)w->w_tmem + len;
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 #ifdef COMPAT_FREEBSD32
 		if (compat32)
 			dlen = SA_SIZE32(sa);
 		else
 #endif
 			dlen = SA_SIZE(sa);
 		if (cp != NULL && buflen >= dlen) {
 			KASSERT(dlen <= sizeof(ss),
 			    ("%s: sockaddr size overflow", __func__));
 			/* Work on a zero-padded copy so the source stays intact. */
 			bzero(&ss, sizeof(ss));
 			bcopy(sa, &ss, sa->sa_len);
 			sa = (struct sockaddr *)&ss;
 #ifdef INET6
 			if (sa->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)sa;
 				(void)sa6_recoverscope(sin6);
 			}
 #endif
 			bcopy((caddr_t)sa, cp, (unsigned)dlen);
 			cp += dlen;
 			buflen -= dlen;
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 
 		len += dlen;
 	}
 
 	/* Zero the padding that aligns the total message length. */
 	if (cp != NULL) {
 		dlen = ALIGN(len) - len;
 		if (buflen < dlen)
 			cp = NULL;
 		else {
 			bzero(cp, dlen);
 			cp += dlen;
 			buflen -= dlen;
 		}
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	/* A buffer was supplied but turned out to be too small. */
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occurred, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 /*
  * Same as rt_missmsg_fib(), with the FIB argument set to RT_ALL_FIBS.
  */
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that the status of a network interface has changed.
  *
  * Sends an RTM_IFINFO message without sockaddrs, carrying the index,
  * flags and interface data of @ifp.  @if_flags_mask is unused.
  */
 static void
 rtsock_ifmsg(struct ifnet *ifp, int if_flags_mask __unused)
 {
 	struct rt_addrinfo no_addrs;
 	struct if_msghdr *hdr;
 	struct mbuf *msg;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	memset(&no_addrs, 0, sizeof(no_addrs));
 	msg = rtsock_msg_mbuf(RTM_IFINFO, &no_addrs);
 	if (msg == NULL)
 		return;
 
 	hdr = mtod(msg, struct if_msghdr *);
 	hdr->ifm_addrs = 0;
 	hdr->ifm_index = ifp->if_index;
 	hdr->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &hdr->ifm_data);
 
 	rt_dispatch(msg, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  * Returns 0 on success.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal to rtsock based on @rt data.
  * Callers are advives to use rt_routemsg() instead of using this
  *  function directly.
  * Assume @rt data is consistent.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
     int fibnum)
 {
 	union sockaddr_union dst, mask;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	int family = rt_get_family(rt);
 	init_sockaddrs_family(family, &dst.sa, &mask.sa);
 	export_rtaddrs(rt, &dst.sa, &mask.sa);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = &dst.sa;
 	info.rti_info[RTAX_NETMASK] = &mask.sa;
 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info.rti_flags = rt->rte_flags | nhop_get_rtflags(nh);
 	info.rti_ifp = nh->nh_ifp;
 
 	return (rtsock_routemsg_info(cmd, &info, fibnum));
 }
 
 /*
  * Announce route message @cmd described by @info to rtsock listeners.
  *
  * @info is modified: the netmask is dropped for RTF_HOST entries,
  * RTF_DONE is always added to rti_flags, and RTF_UP is added for
  * RTM_ADD/RTM_CHANGE.  Unless @fibnum is RT_ALL_FIBS the message is
  * tagged with that FIB.
  *
  * Returns 0 on success (including when nobody is listening), ENOBUFS if
  * the message cannot be allocated.
  */
 int
 rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
 {
 	struct sockaddr *dst;
 	struct rt_msghdr *hdr;
 	struct mbuf *msg;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	/* Host routes are reported without a netmask. */
 	if ((info->rti_flags & RTF_HOST) != 0)
 		info->rti_info[RTAX_NETMASK] = NULL;
 
 	if ((msg = rtsock_msg_mbuf(cmd, info)) == NULL)
 		return (ENOBUFS);
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(msg, fibnum);
 		msg->m_flags |= RTS_FILTER_FIB;
 	}
 
 	/* Add RTF_DONE to indicate command 'completion' required by API */
 	info->rti_flags |= RTF_DONE;
 	/* Reported routes has to be up */
 	if (cmd == RTM_ADD || cmd == RTM_CHANGE)
 		info->rti_flags |= RTF_UP;
 
 	hdr = mtod(msg, struct rt_msghdr *);
 	hdr->rtm_addrs = info->rti_addrs;
 	hdr->rtm_flags = info->rti_flags;
 	if (info->rti_ifp != NULL)
 		hdr->rtm_index = info->rti_ifp->if_index;
 
 	dst = info->rti_info[RTAX_DST];
 	rt_dispatch(msg, dst != NULL ? dst->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberships.  This is easier since
  * there is no route state to worry about.
  *
  * Sends message @cmd for @ifma with the group address as RTAX_IFA, the
  * interface address (when available) as RTAX_IFP and the link-layer
  * address as RTAX_GATEWAY.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct rt_addrinfo info;
 	struct ifma_msghdr *hdr;
 	struct mbuf *msg;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	memset(&info, 0, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	info.rti_info[RTAX_IFP] = (ifp != NULL && ifp->if_addr != NULL) ?
 	    ifp->if_addr->ifa_addr : NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 
 	if ((msg = rtsock_msg_mbuf(cmd, &info)) == NULL)
 		return;
 
 	hdr = mtod(msg, struct ifma_msghdr *);
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	hdr->ifmam_index = ifp->if_index;
 	hdr->ifmam_addrs = info.rti_addrs;
 
 	if (ifma->ifma_addr != NULL)
 		rt_dispatch(msg, ifma->ifma_addr->sa_family);
 	else
 		rt_dispatch(msg, AF_UNSPEC);
 }
 
 /*
  * Allocates and fills an if_announcemsghdr message of the given `type'
  * for interface `ifp'.  `info' is zeroed and used as scratch space for
  * rtsock_msg_mbuf().
  *
  * Returns the message mbuf, or NULL when V_route_cb.any_count is zero or
  * the mbuf could not be obtained.
  */
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *hdr;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return (NULL);
 
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m == NULL)
 		return (NULL);
 
 	hdr = mtod(m, struct if_announcemsghdr *);
 	hdr->ifan_index = ifp->if_index;
 	hdr->ifan_what = what;
 	strlcpy(hdr->ifan_name, ifp->if_xname, sizeof(hdr->ifan_name));
 
 	return (m);
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  *
  * ifp      - interface the event refers to.
  * what     - event code stored in ifan_what.
  * data     - event payload appended after the announce header.
  * data_len - payload length in bytes.
  *
  * The message is silently dropped if no mbuf can be allocated or if the
  * payload does not fit in a single additional mbuf.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m == NULL)
 		return;
 
 	/*
 	 * Append the ieee80211 data.  Try to stick it in the
 	 * mbuf containing the ifannounce msg; otherwise allocate
 	 * a new mbuf and append.
 	 *
 	 * NB: we assume m is a single mbuf.
 	 */
 	if (data_len > M_TRAILINGSPACE(m)) {
 		struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 		/*
 		 * The fresh mbuf has a fixed capacity as well; never
 		 * copy more than it can hold.  Drop the message rather
 		 * than overrun the buffer.
 		 */
 		if (data_len > M_TRAILINGSPACE(n)) {
 			m_freem(n);
 			m_freem(m);
 			return;
 		}
 		bcopy(data, mtod(n, void *), data_len);
 		n->m_len = data_len;
 		m->m_next = n;
 	} else if (data_len > 0) {
 		bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 		m->m_len += data_len;
 	}
 	/* Keep the packet header and the message header lengths in sync. */
 	if (m->m_flags & M_PKTHDR)
 		m->m_pkthdr.len += data_len;
 	mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Generates an RTM_IFANNOUNCE routing socket message for `ifp' carrying
  * the event code `what' (network interface arrival and departure).
  * Nothing is dispatched when the message could not be built.
  */
 static void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct rt_addrinfo info;
 	struct mbuf *msg;
 
 	msg = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (msg == NULL)
 		return;
 
 	rt_dispatch(msg, AF_UNSPEC);
 }
 
 /*
  * Tags the message mbuf with address family `saf' and queues it on the
  * NETISR_ROUTE netisr.  The mbuf must carry a packet header.
  *
  * Ownership of `m' is always consumed: it is freed here when V_loif is
  * not set, and by netisr_queue() should queueing fail.
  */
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	m->m_rtsock_family = saf;
 	if (V_loif == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	m->m_pkthdr.rcvif = V_loif;
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * Checks if rte can be exported w.r.t jails/vnets.
  *
  * Host routes are checked against the credential with prison_if() on the
  * destination address; all other routes are refused when
  * jailed_without_vnet() reports true for the credential.
  *
  * Returns true if it can, false otherwise.
  */
 static bool
 can_export_rte(struct ucred *td_ucred, bool rt_is_host,
     const struct sockaddr *rt_dst)
 {
 
 	if (rt_is_host)
 		return (prison_if(td_ucred, rt_dst) == 0);
 
 	return (!jailed_without_vnet(td_ucred));
 }
 
 
 /*
  * This is used in dumping the kernel table via sysctl().
  *
  * Walk callback: `rt' is the route being visited and `vw' is the
  * struct walkarg describing the request.  The route's destination and
  * mask are exported into w->dst / w->mask, and one record is emitted per
  * nexthop (every member of a nexthop group when ROUTE_MPATH is enabled).
  *
  * Returns 0 when the entry is filtered out by can_export_rte() or was
  * dumped successfully, otherwise the first error reported by
  * sysctl_dumpnhop().  The single-nexthop path reports errors the same
  * way the multipath path does.
  */
 static int
 sysctl_dumpentry(struct rtentry *rt, void *vw)
 {
 	struct walkarg *w = vw;
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 
 	export_rtaddrs(rt, w->dst, w->mask);
 	if (!can_export_rte(w->w_req->td->td_ucred, rt_is_host(rt), w->dst))
 		return (0);
 	nh = rt_get_raw_nhop(rt);
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct weightened_nhop *wn;
 		uint32_t num_nhops;
 		int error;
 
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		/* Unsigned index: num_nhops is a uint32_t. */
 		for (uint32_t i = 0; i < num_nhops; i++) {
 			error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
 			if (error != 0)
 				return (error);
 		}
 		return (0);
 	}
 #endif
 	return (sysctl_dumpnhop(rt, nh, rt->rt_weight, w));
 }
 
 
 /*
  * Emits one RTM_GET record describing route `rt' via nexthop `nh' into
  * the sysctl request attached to `w'.
  *
  * rt     - route entry being dumped (source of the metrics).
  * nh     - nexthop to describe.
  * weight - value stored in rmx_weight for this record.
  * w      - walk state: request, scratch buffer, filter op/arg and the
  *          already-exported dst/mask sockaddrs.
  *
  * For NET_RT_FLAGS requests, nexthops whose rtflags share no bit with
  * w->w_arg are skipped.  Returns 0 on success or skip, otherwise the
  * error from rtsock_msg_buffer() or SYSCTL_OUT().
  */
 static int
 sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight,
     struct walkarg *w)
 {
 	struct rt_addrinfo info;
 	struct rt_msghdr *rtm;
 	struct ifnet *ifp;
 	int error, size;
 	uint32_t rtflags;
 
 	rtflags = nhop_get_rtflags(nh);
 
 	if (w->w_op == NET_RT_FLAGS && !(rtflags & w->w_arg))
 		return (0);
 
 	ifp = nh->nh_ifp;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = w->dst;
 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info.rti_info[RTAX_NETMASK] = (rtflags & RTF_HOST) ? NULL : w->mask;
 	info.rti_info[RTAX_GENMASK] = NULL;
 	/* Interface addresses are only reported for a live interface. */
 	if (ifp != NULL && !(ifp->if_flags & IFF_DYING)) {
 		info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
 		if (ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr;
 	}
 
 	error = rtsock_msg_buffer(RTM_GET, &info, w, &size);
 	if (error != 0)
 		return (error);
 
 	/* Without a request or scratch buffer there is nothing to copy. */
 	if (w->w_req == NULL || w->w_tmem == NULL)
 		return (0);
 
 	rtm = (struct rt_msghdr *)w->w_tmem;
 
 	/* Clear everything from rtm_index to the end of the header. */
 	bzero(&rtm->rtm_index,
 	    sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index));
 
 	/*
 	 * rte flags may consist of RTF_HOST (duplicated in nhop rtflags)
 	 * and RTF_UP (if entry is linked, which is always true here).
 	 * Given that, use nhop rtflags & add RTF_UP.
 	 */
 	rtm->rtm_flags = rtflags | RTF_UP;
 	if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
 		rtm->rtm_flags = RTF_GATEWAY |
 		    (rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
 	rt_getmetrics(rt, nh, &rtm->rtm_rmx);
 	rtm->rtm_rmx.rmx_weight = weight;
 	/*
 	 * The interface pointer was treated as optional above, so do not
 	 * dereference it blindly here; rtm_index stays 0 (from the bzero)
 	 * when there is no interface.
 	 */
 	if (ifp != NULL)
 		rtm->rtm_index = ifp->if_index;
 	rtm->rtm_addrs = info.rti_addrs;
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)rtm, size));
 }
 
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd,
     struct rt_addrinfo *info, struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifm32->_ifm_spare2 = 0;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifm->_ifm_spare2 = 0;
 		ifd = &ifm->ifm_data;
 	}
 
 	memcpy(ifd, src_ifd, sizeof(*ifd));
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd,
     struct rt_addrinfo *info, struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifd = &ifm->ifm_data;
 	}
 
 	memcpy(ifd, src_ifd, sizeof(*ifd));
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 /*
  * Completes the ifa_msghdrl header (or its 32-bit layout for
  * SCTL_MASK32 requests under COMPAT_FREEBSD32) at the start of
  * w->w_tmem for address `ifa', fills the embedded if_data with the
  * per-address packet/byte counters and writes `len' bytes of the buffer
  * out through SYSCTL_OUT().
  *
  * Returns the SYSCTL_OUT() result.
  */
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *hdr;
 	struct if_data *stats;
 
 	hdr = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *hdr32 = (struct ifa_msghdrl32 *)hdr;
 
 		hdr32->ifam_len = sizeof(*hdr32);
 		hdr32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		hdr32->_ifam_spare1 = 0;
 		hdr32->ifam_index = ifa->ifa_ifp->if_index;
 		hdr32->ifam_metric = ifa->ifa_ifp->if_metric;
 		hdr32->ifam_flags = ifa->ifa_flags;
 		hdr32->ifam_addrs = info->rti_addrs;
 		stats = &hdr32->ifam_data;
 	} else
 #endif
 	{
 		hdr->ifam_len = sizeof(*hdr);
 		hdr->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		hdr->_ifam_spare1 = 0;
 		hdr->ifam_index = ifa->ifa_ifp->if_index;
 		hdr->ifam_metric = ifa->ifa_ifp->if_metric;
 		hdr->ifam_flags = ifa->ifa_flags;
 		hdr->ifam_addrs = info->rti_addrs;
 		stats = &hdr->ifam_data;
 	}
 
 	/* Only the fields set below are meaningful; the rest stay zero. */
 	bzero(stats, sizeof(*stats));
 	stats->ifi_datalen = sizeof(struct if_data);
 	stats->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	stats->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	stats->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	stats->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		stats->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 /*
  * Completes the ifa_msghdr header at the start of w->w_tmem for address
  * `ifa' and writes `len' bytes of the buffer out through SYSCTL_OUT().
  *
  * Returns the SYSCTL_OUT() result.
  */
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *hdr = (struct ifa_msghdr *)w->w_tmem;
 
 	hdr->_ifam_spare1 = 0;
 	hdr->ifam_index = ifa->ifa_ifp->if_index;
 	hdr->ifam_metric = ifa->ifa_ifp->if_metric;
 	hdr->ifam_flags = ifa->ifa_flags;
 	hdr->ifam_addrs = info->rti_addrs;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 /*
  * Dumps the interface list for a sysctl request: for every interface on
  * V_ifnet (or only the one whose index equals w->w_arg when that is
  * non-zero) an RTM_IFINFO record is produced, followed by one
  * RTM_NEWADDR record per address on the interface.
  *
  * af - when non-zero, only addresses of this family are reported.
  * w  - walk state (request, scratch buffer, op and arg).
  *
  * Returns 0 on success or the first error from rtsock_msg_buffer() or
  * one of the sysctl_iflist_* output helpers.
  *
  * NOTE(review): ifp->if_addr is dereferenced without a NULL check here,
  * whereas sysctl_ifmalist() tolerates a NULL if_addr -- confirm that it
  * can never be NULL on this path.
  */
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct if_data ifd;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	bzero(&ifd, sizeof(ifd));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		if_data_copy(ifp, &ifd);
 		/* The interface record carries only the RTAX_IFP address. */
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		/* Cleared so the per-address records below do not repeat it. */
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &ifd, &info, w,
 				    len);
 			else
 				error = sysctl_iflist_ifm(ifp, &ifd, &info, w,
 				    len);
 			if (error)
 				goto done;
 		}
 		/* Iteration starts after if_addr, i.e. it is skipped here. */
 		while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			/* Skip addresses prison_if() rejects for the caller. */
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		/* Reset the address slots before moving to the next interface. */
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	return (error);
 }
 
 /*
  * Dumps multicast group memberships for a sysctl request: one
  * RTM_NEWMADDR record per multicast address of every interface on
  * V_ifnet (or only the interface whose index equals w->w_arg when that
  * is non-zero).  Must be called inside the network epoch.
  *
  * af - when non-zero, only group addresses of this family are reported.
  * w  - walk state (request, scratch buffer and arg).
  *
  * Returns 0 on success or the first error from rtsock_msg_buffer() or
  * SYSCTL_OUT().
  */
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct rt_addrinfo info;
 	struct ifma_msghdr *hdr;
 	struct ifmultiaddr *ifma;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int error = 0, len;
 
 	NET_EPOCH_ASSERT();
 
 	bzero((caddr_t)&info, sizeof(info));
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg != 0 && w->w_arg != ifp->if_index)
 			continue;
 
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = (ifa != NULL) ? ifa->ifa_addr : NULL;
 
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af != 0 && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			/* Link-level groups carry no separate lladdr. */
 			if (ifma->ifma_addr->sa_family == AF_LINK)
 				info.rti_info[RTAX_GATEWAY] = NULL;
 			else
 				info.rti_info[RTAX_GATEWAY] =
 				    ifma->ifma_lladdr;
 
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				return (error);
 
 			if (w->w_req == NULL || w->w_tmem == NULL)
 				continue;
 
 			hdr = (struct ifma_msghdr *)w->w_tmem;
 			hdr->_ifmam_spare1 = 0;
 			hdr->ifmam_flags = 0;
 			hdr->ifmam_index = ifma->ifma_ifp->if_index;
 			hdr->ifmam_addrs = info.rti_addrs;
 			error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 			if (error != 0)
 				return (error);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Dumps the routing table of `family' in FIB `fibnum' by walking it with
  * sysctl_dumpentry().  The destination/mask scratch sockaddrs that the
  * callback fills live on this function's stack, so w->dst and w->mask
  * are only valid for the duration of the walk.
  */
 static void
 rtable_sysctl_dump(uint32_t fibnum, int family, struct walkarg *w)
 {
 	union sockaddr_union dst_storage, mask_storage;
 	struct sockaddr *dst = (struct sockaddr *)&dst_storage;
 	struct sockaddr *mask = (struct sockaddr *)&mask_storage;
 
 	w->family = family;
 	w->dst = dst;
 	w->mask = mask;
 
 	init_sockaddrs_family(family, dst, mask);
 
 	rib_walk(fibnum, family, false, sysctl_dumpentry, w);
 }
 
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker et;
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct rib_head *rnh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	if (namelen < 3)
 		return (EINVAL);
 
 	name++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 
 	/*
 	 * Allocate reply buffer in advance.
 	 * All rtsock messages has maximum length of u_short.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	NET_EPOCH_ENTER(et);
 	switch (w.w_op) {
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rnh = rt_tables_get_rnh(fib, i);
 			if (rnh != NULL) {
 				rtable_sysctl_dump(fib, i, &w);
 			} else if (af != 0)
 				error = EAFNOSUPPORT;
 		}
 		break;
 	case NET_RT_NHOP:
 	case NET_RT_NHGRP:
 		/* Allow dumping one specific af/fib at a time */
 		if (namelen < 4) {
 			error = EINVAL;
 			break;
 		}
 		fib = name[3];
 		if (fib < 0 || fib > rt_numfibs) {
 			error = EINVAL;
 			break;
 		}
 		rnh = rt_tables_get_rnh(fib, af);
 		if (rnh == NULL) {
 			error = EAFNOSUPPORT;
 			break;
 		}
 		if (w.w_op == NET_RT_NHOP)
 			error = nhops_dump_sysctl(rnh, w.w_req);
 		else
 #ifdef ROUTE_MPATH
 			error = nhgrp_dump_sysctl(rnh, w.w_req);
 #else
 			error = ENOTSUP;
 #endif
 		break;
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 /*
  * Registers sysctl_rtsock() as the handler of the "routetable" node,
  * numbered PF_ROUTE, under the _net sysctl tree.  The node is flagged
  * read-only (CTLFLAG_RD) and MP-safe (CTLFLAG_MPSAFE).
  */
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_MPSAFE,
     sysctl_rtsock, "Return route tables and interface/address lists");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
 /*
  * Protocol switch entry for routing sockets: a SOCK_RAW protocol with
  * the PR_ATOMIC and PR_ADDR flags, serviced by the rts_* handlers.
  * Both pr_abort and pr_close are handled by rts_close().
  */
 static struct protosw routesw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_abort =		rts_close,
 	.pr_attach =		rts_attach,
 	.pr_detach =		rts_detach,
 	.pr_send =		rts_send,
 	.pr_shutdown =		rts_shutdown,
 	.pr_disconnect =	rts_disconnect,
 	.pr_close =		rts_close,
 };
 
 /*
  * The "route" domain (PF_ROUTE).  It exposes exactly one protocol,
  * routesw.
  */
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		"route",
 	.dom_nprotosw =		1,
 	.dom_protosw =		{ &routesw },
 };
 
 DOMAIN_SET(route);